• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10           New API code Copyright (c) 2016-2019 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 /* This module contains functions for scanning a compiled pattern and
42 collecting data (e.g. minimum matching length). */
43 
44 
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 
49 #include "pcre2_internal.h"
50 
51 /* The maximum remembered capturing brackets minimum. */
52 
53 #define MAX_CACHE_BACKREF 128
54 
55 /* Set a bit in the starting code unit bit map. */
56 
57 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1u << ((c)&7))
58 
59 /* Returns from set_start_bits() */
60 
61 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
62 
63 
64 /*************************************************
65 *   Find the minimum subject length for a group  *
66 *************************************************/
67 
68 /* Scan a parenthesized group and compute the minimum length of subject that
69 is needed to match it. This is a lower bound; it does not mean there is a
70 string of that length that matches. In UTF mode, the result is in characters
71 rather than code units. The field in a compiled pattern for storing the minimum
72 length is 16-bits long (on the grounds that anything longer than that is
73 pathological), so we give up when we reach that amount. This also means that
74 integer overflow for really crazy patterns cannot happen.
75 
76 Backreference minimum lengths are cached to speed up multiple references. This
77 function is called only when the highest back reference in the pattern is less
78 than or equal to MAX_CACHE_BACKREF, which is one less than the size of the
79 caching vector. The zeroth element contains the number of the highest set
80 value.
81 
82 Arguments:
83   re              compiled pattern block
84   code            pointer to start of group (the bracket)
85   startcode       pointer to start of the whole pattern's code
86   utf             UTF flag
87   recurses        chain of recurse_check to catch mutual recursion
88   countptr        pointer to call count (to catch over complexity)
89   backref_cache   vector for caching back references.
90 
91 Returns:   the minimum length
92            -1 \C in UTF-8 mode
93               or (*ACCEPT)
94               or pattern too complicated
95               or back reference to duplicate name/number
96            -2 internal error (missing capturing bracket)
97            -3 internal error (opcode not listed)
98 */
99 
100 static int
find_minlength(const pcre2_real_code * re,PCRE2_SPTR code,PCRE2_SPTR startcode,BOOL utf,recurse_check * recurses,int * countptr,int * backref_cache)101 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
102   PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr,
103   int *backref_cache)
104 {
105 int length = -1;
106 int prev_cap_recno = -1;
107 int prev_cap_d = 0;
108 int prev_recurse_recno = -1;
109 int prev_recurse_d = 0;
110 uint32_t once_fudge = 0;
111 BOOL had_recurse = FALSE;
112 BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
113 recurse_check this_recurse;
114 int branchlength = 0;
115 PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
116 
117 /* If this is a "could be empty" group, its minimum length is 0. */
118 
119 if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
120 
121 /* Skip over capturing bracket number */
122 
123 if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
124 
125 /* A large and/or complex regex can take too long to process. */
126 
127 if ((*countptr)++ > 1000) return -1;
128 
129 /* Scan along the opcodes for this branch. If we get to the end of the branch,
130 check the length against that of the other branches. If the accumulated length
131 passes 16-bits, stop. */
132 
133 for (;;)
134   {
135   int d, min, recno;
136   PCRE2_UCHAR *cs, *ce;
137   PCRE2_UCHAR op = *cc;
138 
139   if (branchlength >= UINT16_MAX) return UINT16_MAX;
140 
141   switch (op)
142     {
143     case OP_COND:
144     case OP_SCOND:
145 
146     /* If there is only one branch in a condition, the implied branch has zero
147     length, so we don't add anything. This covers the DEFINE "condition"
148     automatically. If there are two branches we can treat it the same as any
149     other non-capturing subpattern. */
150 
151     cs = cc + GET(cc, 1);
152     if (*cs != OP_ALT)
153       {
154       cc = cs + 1 + LINK_SIZE;
155       break;
156       }
157     goto PROCESS_NON_CAPTURE;
158 
159     case OP_BRA:
160     /* There's a special case of OP_BRA, when it is wrapped round a repeated
161     OP_RECURSE. We'd like to process the latter at this level so that
162     remembering the value works for repeated cases. So we do nothing, but
163     set a fudge value to skip over the OP_KET after the recurse. */
164 
165     if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
166       {
167       once_fudge = 1 + LINK_SIZE;
168       cc += 1 + LINK_SIZE;
169       break;
170       }
171     /* Fall through */
172 
173     case OP_ONCE:
174     case OP_SCRIPT_RUN:
175     case OP_SBRA:
176     case OP_BRAPOS:
177     case OP_SBRAPOS:
178     PROCESS_NON_CAPTURE:
179     d = find_minlength(re, cc, startcode, utf, recurses, countptr,
180       backref_cache);
181     if (d < 0) return d;
182     branchlength += d;
183     do cc += GET(cc, 1); while (*cc == OP_ALT);
184     cc += 1 + LINK_SIZE;
185     break;
186 
187     /* To save time for repeated capturing subpatterns, we remember the
188     length of the previous one. Unfortunately we can't do the same for
189     the unnumbered ones above. Nor can we do this if (?| is present in the
190     pattern because captures with the same number are not then identical. */
191 
192     case OP_CBRA:
193     case OP_SCBRA:
194     case OP_CBRAPOS:
195     case OP_SCBRAPOS:
196     recno = (int)GET2(cc, 1+LINK_SIZE);
197     if (dupcapused || recno != prev_cap_recno)
198       {
199       prev_cap_recno = recno;
200       prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr,
201         backref_cache);
202       if (prev_cap_d < 0) return prev_cap_d;
203       }
204     branchlength += prev_cap_d;
205     do cc += GET(cc, 1); while (*cc == OP_ALT);
206     cc += 1 + LINK_SIZE;
207     break;
208 
209     /* ACCEPT makes things far too complicated; we have to give up. */
210 
211     case OP_ACCEPT:
212     case OP_ASSERT_ACCEPT:
213     return -1;
214 
215     /* Reached end of a branch; if it's a ket it is the end of a nested
216     call. If it's ALT it is an alternation in a nested call. If it is END it's
217     the end of the outer call. All can be handled by the same code. If an
218     ACCEPT was previously encountered, use the length that was in force at that
219     time, and pass back the shortest ACCEPT length. */
220 
221     case OP_ALT:
222     case OP_KET:
223     case OP_KETRMAX:
224     case OP_KETRMIN:
225     case OP_KETRPOS:
226     case OP_END:
227     if (length < 0 || (!had_recurse && branchlength < length))
228       length = branchlength;
229     if (op != OP_ALT) return length;
230     cc += 1 + LINK_SIZE;
231     branchlength = 0;
232     had_recurse = FALSE;
233     break;
234 
235     /* Skip over assertive subpatterns */
236 
237     case OP_ASSERT:
238     case OP_ASSERT_NOT:
239     case OP_ASSERTBACK:
240     case OP_ASSERTBACK_NOT:
241     do cc += GET(cc, 1); while (*cc == OP_ALT);
242     /* Fall through */
243 
244     /* Skip over things that don't match chars */
245 
246     case OP_REVERSE:
247     case OP_CREF:
248     case OP_DNCREF:
249     case OP_RREF:
250     case OP_DNRREF:
251     case OP_FALSE:
252     case OP_TRUE:
253     case OP_CALLOUT:
254     case OP_SOD:
255     case OP_SOM:
256     case OP_EOD:
257     case OP_EODN:
258     case OP_CIRC:
259     case OP_CIRCM:
260     case OP_DOLL:
261     case OP_DOLLM:
262     case OP_NOT_WORD_BOUNDARY:
263     case OP_WORD_BOUNDARY:
264     cc += PRIV(OP_lengths)[*cc];
265     break;
266 
267     case OP_CALLOUT_STR:
268     cc += GET(cc, 1 + 2*LINK_SIZE);
269     break;
270 
271     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
272 
273     case OP_BRAZERO:
274     case OP_BRAMINZERO:
275     case OP_BRAPOSZERO:
276     case OP_SKIPZERO:
277     cc += PRIV(OP_lengths)[*cc];
278     do cc += GET(cc, 1); while (*cc == OP_ALT);
279     cc += 1 + LINK_SIZE;
280     break;
281 
282     /* Handle literal characters and + repetitions */
283 
284     case OP_CHAR:
285     case OP_CHARI:
286     case OP_NOT:
287     case OP_NOTI:
288     case OP_PLUS:
289     case OP_PLUSI:
290     case OP_MINPLUS:
291     case OP_MINPLUSI:
292     case OP_POSPLUS:
293     case OP_POSPLUSI:
294     case OP_NOTPLUS:
295     case OP_NOTPLUSI:
296     case OP_NOTMINPLUS:
297     case OP_NOTMINPLUSI:
298     case OP_NOTPOSPLUS:
299     case OP_NOTPOSPLUSI:
300     branchlength++;
301     cc += 2;
302 #ifdef SUPPORT_UNICODE
303     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
304 #endif
305     break;
306 
307     case OP_TYPEPLUS:
308     case OP_TYPEMINPLUS:
309     case OP_TYPEPOSPLUS:
310     branchlength++;
311     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
312     break;
313 
314     /* Handle exact repetitions. The count is already in characters, but we
315     may need to skip over a multibyte character in UTF mode.  */
316 
317     case OP_EXACT:
318     case OP_EXACTI:
319     case OP_NOTEXACT:
320     case OP_NOTEXACTI:
321     branchlength += GET2(cc,1);
322     cc += 2 + IMM2_SIZE;
323 #ifdef SUPPORT_UNICODE
324     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
325 #endif
326     break;
327 
328     case OP_TYPEEXACT:
329     branchlength += GET2(cc,1);
330     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
331       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
332     break;
333 
334     /* Handle single-char non-literal matchers */
335 
336     case OP_PROP:
337     case OP_NOTPROP:
338     cc += 2;
339     /* Fall through */
340 
341     case OP_NOT_DIGIT:
342     case OP_DIGIT:
343     case OP_NOT_WHITESPACE:
344     case OP_WHITESPACE:
345     case OP_NOT_WORDCHAR:
346     case OP_WORDCHAR:
347     case OP_ANY:
348     case OP_ALLANY:
349     case OP_EXTUNI:
350     case OP_HSPACE:
351     case OP_NOT_HSPACE:
352     case OP_VSPACE:
353     case OP_NOT_VSPACE:
354     branchlength++;
355     cc++;
356     break;
357 
358     /* "Any newline" might match two characters, but it also might match just
359     one. */
360 
361     case OP_ANYNL:
362     branchlength += 1;
363     cc++;
364     break;
365 
366     /* The single-byte matcher means we can't proceed in UTF mode. (In
367     non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
368     appear, but leave the code, just in case.) */
369 
370     case OP_ANYBYTE:
371 #ifdef SUPPORT_UNICODE
372     if (utf) return -1;
373 #endif
374     branchlength++;
375     cc++;
376     break;
377 
378     /* For repeated character types, we have to test for \p and \P, which have
379     an extra two bytes of parameters. */
380 
381     case OP_TYPESTAR:
382     case OP_TYPEMINSTAR:
383     case OP_TYPEQUERY:
384     case OP_TYPEMINQUERY:
385     case OP_TYPEPOSSTAR:
386     case OP_TYPEPOSQUERY:
387     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
388     cc += PRIV(OP_lengths)[op];
389     break;
390 
391     case OP_TYPEUPTO:
392     case OP_TYPEMINUPTO:
393     case OP_TYPEPOSUPTO:
394     if (cc[1 + IMM2_SIZE] == OP_PROP
395       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
396     cc += PRIV(OP_lengths)[op];
397     break;
398 
399     /* Check a class for variable quantification */
400 
401     case OP_CLASS:
402     case OP_NCLASS:
403 #ifdef SUPPORT_WIDE_CHARS
404     case OP_XCLASS:
405     /* The original code caused an unsigned overflow in 64 bit systems,
406     so now we use a conditional statement. */
407     if (op == OP_XCLASS)
408       cc += GET(cc, 1);
409     else
410       cc += PRIV(OP_lengths)[OP_CLASS];
411 #else
412     cc += PRIV(OP_lengths)[OP_CLASS];
413 #endif
414 
415     switch (*cc)
416       {
417       case OP_CRPLUS:
418       case OP_CRMINPLUS:
419       case OP_CRPOSPLUS:
420       branchlength++;
421       /* Fall through */
422 
423       case OP_CRSTAR:
424       case OP_CRMINSTAR:
425       case OP_CRQUERY:
426       case OP_CRMINQUERY:
427       case OP_CRPOSSTAR:
428       case OP_CRPOSQUERY:
429       cc++;
430       break;
431 
432       case OP_CRRANGE:
433       case OP_CRMINRANGE:
434       case OP_CRPOSRANGE:
435       branchlength += GET2(cc,1);
436       cc += 1 + 2 * IMM2_SIZE;
437       break;
438 
439       default:
440       branchlength++;
441       break;
442       }
443     break;
444 
445     /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
446     way: we find the minimum length for the subpattern. A recursion
447     (backreference or subroutine) causes an a flag to be set that causes the
448     length of this branch to be ignored. The logic is that a recursion can only
449     make sense if there is another alternative that stops the recursing. That
450     will provide the minimum length (when no recursion happens).
451 
452     If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
453     matches an empty string (by default it causes a matching failure), so in
454     that case we must set the minimum length to zero. */
455 
456     /* Duplicate named pattern back reference. We cannot reliably find a length
457     for this if duplicate numbers are present in the pattern. */
458 
459     case OP_DNREF:
460     case OP_DNREFI:
461     if (dupcapused) return -1;
462     if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
463       {
464       int count = GET2(cc, 1+IMM2_SIZE);
465       PCRE2_UCHAR *slot =
466         (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
467           GET2(cc, 1) * re->name_entry_size;
468 
469       d = INT_MAX;
470 
471       /* Scan all groups with the same name; find the shortest. */
472 
473       while (count-- > 0)
474         {
475         int dd, i;
476         recno = GET2(slot, 0);
477 
478         if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
479           dd = backref_cache[recno];
480         else
481           {
482           ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
483           if (cs == NULL) return -2;
484           do ce += GET(ce, 1); while (*ce == OP_ALT);
485           if (cc > cs && cc < ce)    /* Simple recursion */
486             {
487             dd = 0;
488             had_recurse = TRUE;
489             }
490           else
491             {
492             recurse_check *r = recurses;
493             for (r = recurses; r != NULL; r = r->prev)
494               if (r->group == cs) break;
495             if (r != NULL)           /* Mutual recursion */
496               {
497               dd = 0;
498               had_recurse = TRUE;
499               }
500             else
501               {
502               this_recurse.prev = recurses;
503               this_recurse.group = cs;
504               dd = find_minlength(re, cs, startcode, utf, &this_recurse,
505                 countptr, backref_cache);
506               if (dd < 0) return dd;
507               }
508             }
509 
510           backref_cache[recno] = dd;
511           for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
512           backref_cache[0] = recno;
513           }
514 
515         if (dd < d) d = dd;
516         if (d <= 0) break;    /* No point looking at any more */
517         slot += re->name_entry_size;
518         }
519       }
520     else d = 0;
521     cc += 1 + 2*IMM2_SIZE;
522     goto REPEAT_BACK_REFERENCE;
523 
524     /* Single back reference. We cannot find a length for this if duplicate
525     numbers are present in the pattern. */
526 
527     case OP_REF:
528     case OP_REFI:
529     if (dupcapused) return -1;
530     recno = GET2(cc, 1);
531     if (recno <= backref_cache[0] && backref_cache[recno] >= 0)
532       d = backref_cache[recno];
533     else
534       {
535       int i;
536       if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
537         {
538         ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, recno);
539         if (cs == NULL) return -2;
540         do ce += GET(ce, 1); while (*ce == OP_ALT);
541         if (cc > cs && cc < ce)    /* Simple recursion */
542           {
543           d = 0;
544           had_recurse = TRUE;
545           }
546         else
547           {
548           recurse_check *r = recurses;
549           for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
550           if (r != NULL)           /* Mutual recursion */
551             {
552             d = 0;
553             had_recurse = TRUE;
554             }
555           else
556             {
557             this_recurse.prev = recurses;
558             this_recurse.group = cs;
559             d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr,
560               backref_cache);
561             if (d < 0) return d;
562             }
563           }
564         }
565       else d = 0;
566 
567       backref_cache[recno] = d;
568       for (i = backref_cache[0] + 1; i < recno; i++) backref_cache[i] = -1;
569       backref_cache[0] = recno;
570       }
571 
572     cc += 1 + IMM2_SIZE;
573 
574     /* Handle repeated back references */
575 
576     REPEAT_BACK_REFERENCE:
577     switch (*cc)
578       {
579       case OP_CRSTAR:
580       case OP_CRMINSTAR:
581       case OP_CRQUERY:
582       case OP_CRMINQUERY:
583       case OP_CRPOSSTAR:
584       case OP_CRPOSQUERY:
585       min = 0;
586       cc++;
587       break;
588 
589       case OP_CRPLUS:
590       case OP_CRMINPLUS:
591       case OP_CRPOSPLUS:
592       min = 1;
593       cc++;
594       break;
595 
596       case OP_CRRANGE:
597       case OP_CRMINRANGE:
598       case OP_CRPOSRANGE:
599       min = GET2(cc, 1);
600       cc += 1 + 2 * IMM2_SIZE;
601       break;
602 
603       default:
604       min = 1;
605       break;
606       }
607 
608      /* Take care not to overflow: (1) min and d are ints, so check that their
609      product is not greater than INT_MAX. (2) branchlength is limited to
610      UINT16_MAX (checked at the top of the loop). */
611 
612     if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
613       branchlength = UINT16_MAX;
614     else branchlength += min * d;
615     break;
616 
617     /* Recursion always refers to the first occurrence of a subpattern with a
618     given number. Therefore, we can always make use of caching, even when the
619     pattern contains multiple subpatterns with the same number. */
620 
621     case OP_RECURSE:
622     cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
623     recno = GET2(cs, 1+LINK_SIZE);
624     if (recno == prev_recurse_recno)
625       {
626       branchlength += prev_recurse_d;
627       }
628     else
629       {
630       do ce += GET(ce, 1); while (*ce == OP_ALT);
631       if (cc > cs && cc < ce)    /* Simple recursion */
632         had_recurse = TRUE;
633       else
634         {
635         recurse_check *r = recurses;
636         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
637         if (r != NULL)          /* Mutual recursion */
638           had_recurse = TRUE;
639         else
640           {
641           this_recurse.prev = recurses;
642           this_recurse.group = cs;
643           prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
644             countptr, backref_cache);
645           if (prev_recurse_d < 0) return prev_recurse_d;
646           prev_recurse_recno = recno;
647           branchlength += prev_recurse_d;
648           }
649         }
650       }
651     cc += 1 + LINK_SIZE + once_fudge;
652     once_fudge = 0;
653     break;
654 
655     /* Anything else does not or need not match a character. We can get the
656     item's length from the table, but for those that can match zero occurrences
657     of a character, we must take special action for UTF-8 characters. As it
658     happens, the "NOT" versions of these opcodes are used at present only for
659     ASCII characters, so they could be omitted from this list. However, in
660     future that may change, so we include them here so as not to leave a
661     gotcha for a future maintainer. */
662 
663     case OP_UPTO:
664     case OP_UPTOI:
665     case OP_NOTUPTO:
666     case OP_NOTUPTOI:
667     case OP_MINUPTO:
668     case OP_MINUPTOI:
669     case OP_NOTMINUPTO:
670     case OP_NOTMINUPTOI:
671     case OP_POSUPTO:
672     case OP_POSUPTOI:
673     case OP_NOTPOSUPTO:
674     case OP_NOTPOSUPTOI:
675 
676     case OP_STAR:
677     case OP_STARI:
678     case OP_NOTSTAR:
679     case OP_NOTSTARI:
680     case OP_MINSTAR:
681     case OP_MINSTARI:
682     case OP_NOTMINSTAR:
683     case OP_NOTMINSTARI:
684     case OP_POSSTAR:
685     case OP_POSSTARI:
686     case OP_NOTPOSSTAR:
687     case OP_NOTPOSSTARI:
688 
689     case OP_QUERY:
690     case OP_QUERYI:
691     case OP_NOTQUERY:
692     case OP_NOTQUERYI:
693     case OP_MINQUERY:
694     case OP_MINQUERYI:
695     case OP_NOTMINQUERY:
696     case OP_NOTMINQUERYI:
697     case OP_POSQUERY:
698     case OP_POSQUERYI:
699     case OP_NOTPOSQUERY:
700     case OP_NOTPOSQUERYI:
701 
702     cc += PRIV(OP_lengths)[op];
703 #ifdef SUPPORT_UNICODE
704     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
705 #endif
706     break;
707 
708     /* Skip these, but we need to add in the name length. */
709 
710     case OP_MARK:
711     case OP_COMMIT_ARG:
712     case OP_PRUNE_ARG:
713     case OP_SKIP_ARG:
714     case OP_THEN_ARG:
715     cc += PRIV(OP_lengths)[op] + cc[1];
716     break;
717 
718     /* The remaining opcodes are just skipped over. */
719 
720     case OP_CLOSE:
721     case OP_COMMIT:
722     case OP_FAIL:
723     case OP_PRUNE:
724     case OP_SET_SOM:
725     case OP_SKIP:
726     case OP_THEN:
727     cc += PRIV(OP_lengths)[op];
728     break;
729 
730     /* This should not occur: we list all opcodes explicitly so that when
731     new ones get added they are properly considered. */
732 
733     default:
734     return -3;
735     }
736   }
737 /* Control never gets here */
738 }
739 
740 
741 
742 /*************************************************
743 *      Set a bit and maybe its alternate case    *
744 *************************************************/
745 
746 /* Given a character, set its first code unit's bit in the table, and also the
747 corresponding bit for the other version of a letter if we are caseless.
748 
749 Arguments:
750   re            points to the regex block
751   p             points to the first code unit of the character
752   caseless      TRUE if caseless
753   utf           TRUE for UTF mode
754 
755 Returns:        pointer after the character
756 */
757 
758 static PCRE2_SPTR
set_table_bit(pcre2_real_code * re,PCRE2_SPTR p,BOOL caseless,BOOL utf)759 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
760 {
761 uint32_t c = *p++;   /* First code unit */
762 (void)utf;           /* Stop compiler warning when UTF not supported */
763 
764 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
765 0xff. */
766 
767 #if PCRE2_CODE_UNIT_WIDTH != 8
768 if (c > 0xff) SET_BIT(0xff); else
769 #endif
770 
771 SET_BIT(c);
772 
773 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
774 the end of the character, even when caseless. */
775 
776 #ifdef SUPPORT_UNICODE
777 if (utf)
778   {
779 #if PCRE2_CODE_UNIT_WIDTH == 8
780   if (c >= 0xc0) GETUTF8INC(c, p);
781 #elif PCRE2_CODE_UNIT_WIDTH == 16
782   if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
783 #endif
784   }
785 #endif  /* SUPPORT_UNICODE */
786 
787 /* If caseless, handle the other case of the character. */
788 
789 if (caseless)
790   {
791 #ifdef SUPPORT_UNICODE
792   if (utf)
793     {
794 #if PCRE2_CODE_UNIT_WIDTH == 8
795     PCRE2_UCHAR buff[6];
796     c = UCD_OTHERCASE(c);
797     (void)PRIV(ord2utf)(c, buff);
798     SET_BIT(buff[0]);
799 #else  /* 16-bit or 32-bit mode */
800     c = UCD_OTHERCASE(c);
801     if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
802 #endif
803     }
804   else
805 #endif  /* SUPPORT_UNICODE */
806 
807   /* Not UTF */
808 
809   if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
810   }
811 
812 return p;
813 }
814 
815 
816 
817 /*************************************************
818 *     Set bits for a positive character type     *
819 *************************************************/
820 
821 /* This function sets starting bits for a character type. In UTF-8 mode, we can
822 only do a direct setting for bytes less than 128, as otherwise there can be
823 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
824 environment, the tables will only recognize ASCII characters anyway, but in at
825 least one Windows environment, some higher bytes bits were set in the tables.
826 So we deal with that case by considering the UTF-8 encoding.
827 
828 Arguments:
829   re             the regex block
830   cbit type      the type of character wanted
831   table_limit    32 for non-UTF-8; 16 for UTF-8
832 
833 Returns:         nothing
834 */
835 
836 static void
set_type_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)837 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
838 {
839 uint32_t c;
840 for (c = 0; c < table_limit; c++)
841   re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
842 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
843 if (table_limit == 32) return;
844 for (c = 128; c < 256; c++)
845   {
846   if ((re->tables[cbits_offset + c/8] & (1u << (c&7))) != 0)
847     {
848     PCRE2_UCHAR buff[6];
849     (void)PRIV(ord2utf)(c, buff);
850     SET_BIT(buff[0]);
851     }
852   }
853 #endif  /* UTF-8 */
854 }
855 
856 
857 /*************************************************
858 *     Set bits for a negative character type     *
859 *************************************************/
860 
861 /* This function sets starting bits for a negative character type such as \D.
862 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
863 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
864 Unlike in the positive case, where we can set appropriate starting bits for
865 specific high-valued UTF-8 characters, in this case we have to set the bits for
866 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
867 0xc0 (192) for simplicity.
868 
869 Arguments:
870   re             the regex block
871   cbit type      the type of character wanted
872   table_limit    32 for non-UTF-8; 16 for UTF-8
873 
874 Returns:         nothing
875 */
876 
877 static void
set_nottype_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)878 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
879 {
880 uint32_t c;
881 for (c = 0; c < table_limit; c++)
882   re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
883 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
884 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
885 #endif
886 }
887 
888 
889 
890 /*************************************************
891 *          Create bitmap of starting bytes       *
892 *************************************************/
893 
894 /* This function scans a compiled unanchored expression recursively and
895 attempts to build a bitmap of the set of possible starting code units whose
896 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
897 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
898 we pass a value of 16 rather than 32 as the final argument. (See comments in
899 those functions for the reason.)
900 
901 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
902 (a*)b where the group provides some optional starting code units but scanning
903 must continue at the outer level to find at least one mandatory code unit. At
904 the outermost level, this function fails unless the result is SSB_DONE.
905 
906 Arguments:
907   re           points to the compiled regex block
908   code         points to an expression
909   utf          TRUE if in UTF mode
910 
911 Returns:       SSB_FAIL     => Failed to find any starting code units
912                SSB_DONE     => Found mandatory starting code units
913                SSB_CONTINUE => Found optional starting code units
914                SSB_UNKNOWN  => Hit an unrecognized opcode
915 */
916 
917 static int
set_start_bits(pcre2_real_code * re,PCRE2_SPTR code,BOOL utf)918 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
919 {
920 uint32_t c;
921 int yield = SSB_DONE;
922 
923 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
924 int table_limit = utf? 16:32;
925 #else
926 int table_limit = 32;
927 #endif
928 
929 do
930   {
931   BOOL try_next = TRUE;
932   PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
933 
934   if (*code == OP_CBRA || *code == OP_SCBRA ||
935       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
936 
937   while (try_next)    /* Loop for items in this branch */
938     {
939     int rc;
940     uint8_t *classmap = NULL;
941 
942     switch(*tcode)
943       {
944       /* If we reach something we don't understand, it means a new opcode has
945       been created that hasn't been added to this function. Hopefully this
946       problem will be discovered during testing. */
947 
948       default:
949       return SSB_UNKNOWN;
950 
951       /* Fail for a valid opcode that implies no starting bits. */
952 
953       case OP_ACCEPT:
954       case OP_ASSERT_ACCEPT:
955       case OP_ALLANY:
956       case OP_ANY:
957       case OP_ANYBYTE:
958       case OP_CIRCM:
959       case OP_CLOSE:
960       case OP_COMMIT:
961       case OP_COMMIT_ARG:
962       case OP_COND:
963       case OP_CREF:
964       case OP_FALSE:
965       case OP_TRUE:
966       case OP_DNCREF:
967       case OP_DNREF:
968       case OP_DNREFI:
969       case OP_DNRREF:
970       case OP_DOLL:
971       case OP_DOLLM:
972       case OP_END:
973       case OP_EOD:
974       case OP_EODN:
975       case OP_EXTUNI:
976       case OP_FAIL:
977       case OP_MARK:
978       case OP_NOT:
979       case OP_NOTEXACT:
980       case OP_NOTEXACTI:
981       case OP_NOTI:
982       case OP_NOTMINPLUS:
983       case OP_NOTMINPLUSI:
984       case OP_NOTMINQUERY:
985       case OP_NOTMINQUERYI:
986       case OP_NOTMINSTAR:
987       case OP_NOTMINSTARI:
988       case OP_NOTMINUPTO:
989       case OP_NOTMINUPTOI:
990       case OP_NOTPLUS:
991       case OP_NOTPLUSI:
992       case OP_NOTPOSPLUS:
993       case OP_NOTPOSPLUSI:
994       case OP_NOTPOSQUERY:
995       case OP_NOTPOSQUERYI:
996       case OP_NOTPOSSTAR:
997       case OP_NOTPOSSTARI:
998       case OP_NOTPOSUPTO:
999       case OP_NOTPOSUPTOI:
1000       case OP_NOTPROP:
1001       case OP_NOTQUERY:
1002       case OP_NOTQUERYI:
1003       case OP_NOTSTAR:
1004       case OP_NOTSTARI:
1005       case OP_NOTUPTO:
1006       case OP_NOTUPTOI:
1007       case OP_NOT_HSPACE:
1008       case OP_NOT_VSPACE:
1009       case OP_PRUNE:
1010       case OP_PRUNE_ARG:
1011       case OP_RECURSE:
1012       case OP_REF:
1013       case OP_REFI:
1014       case OP_REVERSE:
1015       case OP_RREF:
1016       case OP_SCOND:
1017       case OP_SET_SOM:
1018       case OP_SKIP:
1019       case OP_SKIP_ARG:
1020       case OP_SOD:
1021       case OP_SOM:
1022       case OP_THEN:
1023       case OP_THEN_ARG:
1024       return SSB_FAIL;
1025 
1026       /* OP_CIRC happens only at the start of an anchored branch (multiline ^
1027       uses OP_CIRCM). Skip over it. */
1028 
1029       case OP_CIRC:
1030       tcode += PRIV(OP_lengths)[OP_CIRC];
1031       break;
1032 
1033       /* A "real" property test implies no starting bits, but the fake property
1034       PT_CLIST identifies a list of characters. These lists are short, as they
1035       are used for characters with more than one "other case", so there is no
1036       point in recognizing them for OP_NOTPROP. */
1037 
1038       case OP_PROP:
1039       if (tcode[1] != PT_CLIST) return SSB_FAIL;
1040         {
1041         const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
1042         while ((c = *p++) < NOTACHAR)
1043           {
1044 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1045           if (utf)
1046             {
1047             PCRE2_UCHAR buff[6];
1048             (void)PRIV(ord2utf)(c, buff);
1049             c = buff[0];
1050             }
1051 #endif
1052           if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1053           }
1054         }
1055       try_next = FALSE;
1056       break;
1057 
1058       /* We can ignore word boundary tests. */
1059 
1060       case OP_WORD_BOUNDARY:
1061       case OP_NOT_WORD_BOUNDARY:
1062       tcode++;
1063       break;
1064 
1065       /* If we hit a bracket or a positive lookahead assertion, recurse to set
1066       bits from within the subpattern. If it can't find anything, we have to
1067       give up. If it finds some mandatory character(s), we are done for this
1068       branch. Otherwise, carry on scanning after the subpattern. */
1069 
1070       case OP_BRA:
1071       case OP_SBRA:
1072       case OP_CBRA:
1073       case OP_SCBRA:
1074       case OP_BRAPOS:
1075       case OP_SBRAPOS:
1076       case OP_CBRAPOS:
1077       case OP_SCBRAPOS:
1078       case OP_ONCE:
1079       case OP_SCRIPT_RUN:
1080       case OP_ASSERT:
1081       rc = set_start_bits(re, tcode, utf);
1082       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1083       if (rc == SSB_DONE) try_next = FALSE; else
1084         {
1085         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1086         tcode += 1 + LINK_SIZE;
1087         }
1088       break;
1089 
1090       /* If we hit ALT or KET, it means we haven't found anything mandatory in
1091       this branch, though we might have found something optional. For ALT, we
1092       continue with the next alternative, but we have to arrange that the final
1093       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1094       return SSB_CONTINUE: if this is the top level, that indicates failure,
1095       but after a nested subpattern, it causes scanning to continue. */
1096 
1097       case OP_ALT:
1098       yield = SSB_CONTINUE;
1099       try_next = FALSE;
1100       break;
1101 
1102       case OP_KET:
1103       case OP_KETRMAX:
1104       case OP_KETRMIN:
1105       case OP_KETRPOS:
1106       return SSB_CONTINUE;
1107 
1108       /* Skip over callout */
1109 
1110       case OP_CALLOUT:
1111       tcode += PRIV(OP_lengths)[OP_CALLOUT];
1112       break;
1113 
1114       case OP_CALLOUT_STR:
1115       tcode += GET(tcode, 1 + 2*LINK_SIZE);
1116       break;
1117 
1118       /* Skip over lookbehind and negative lookahead assertions */
1119 
1120       case OP_ASSERT_NOT:
1121       case OP_ASSERTBACK:
1122       case OP_ASSERTBACK_NOT:
1123       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1124       tcode += 1 + LINK_SIZE;
1125       break;
1126 
1127       /* BRAZERO does the bracket, but carries on. */
1128 
1129       case OP_BRAZERO:
1130       case OP_BRAMINZERO:
1131       case OP_BRAPOSZERO:
1132       rc = set_start_bits(re, ++tcode, utf);
1133       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1134       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1135       tcode += 1 + LINK_SIZE;
1136       break;
1137 
1138       /* SKIPZERO skips the bracket. */
1139 
1140       case OP_SKIPZERO:
1141       tcode++;
1142       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1143       tcode += 1 + LINK_SIZE;
1144       break;
1145 
1146       /* Single-char * or ? sets the bit and tries the next item */
1147 
1148       case OP_STAR:
1149       case OP_MINSTAR:
1150       case OP_POSSTAR:
1151       case OP_QUERY:
1152       case OP_MINQUERY:
1153       case OP_POSQUERY:
1154       tcode = set_table_bit(re, tcode + 1, FALSE, utf);
1155       break;
1156 
1157       case OP_STARI:
1158       case OP_MINSTARI:
1159       case OP_POSSTARI:
1160       case OP_QUERYI:
1161       case OP_MINQUERYI:
1162       case OP_POSQUERYI:
1163       tcode = set_table_bit(re, tcode + 1, TRUE, utf);
1164       break;
1165 
1166       /* Single-char upto sets the bit and tries the next */
1167 
1168       case OP_UPTO:
1169       case OP_MINUPTO:
1170       case OP_POSUPTO:
1171       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
1172       break;
1173 
1174       case OP_UPTOI:
1175       case OP_MINUPTOI:
1176       case OP_POSUPTOI:
1177       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
1178       break;
1179 
1180       /* At least one single char sets the bit and stops */
1181 
1182       case OP_EXACT:
1183       tcode += IMM2_SIZE;
1184       /* Fall through */
1185       case OP_CHAR:
1186       case OP_PLUS:
1187       case OP_MINPLUS:
1188       case OP_POSPLUS:
1189       (void)set_table_bit(re, tcode + 1, FALSE, utf);
1190       try_next = FALSE;
1191       break;
1192 
1193       case OP_EXACTI:
1194       tcode += IMM2_SIZE;
1195       /* Fall through */
1196       case OP_CHARI:
1197       case OP_PLUSI:
1198       case OP_MINPLUSI:
1199       case OP_POSPLUSI:
1200       (void)set_table_bit(re, tcode + 1, TRUE, utf);
1201       try_next = FALSE;
1202       break;
1203 
1204       /* Special spacing and line-terminating items. These recognize specific
1205       lists of characters. The difference between VSPACE and ANYNL is that the
1206       latter can match the two-character CRLF sequence, but that is not
1207       relevant for finding the first character, so their code here is
1208       identical. */
1209 
1210       case OP_HSPACE:
1211       SET_BIT(CHAR_HT);
1212       SET_BIT(CHAR_SPACE);
1213 
1214       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1215       the bits for 0xA0 and for code units >= 255, independently of UTF. */
1216 
1217 #if PCRE2_CODE_UNIT_WIDTH != 8
1218       SET_BIT(0xA0);
1219       SET_BIT(0xFF);
1220 #else
1221       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1222       units of horizontal space characters. */
1223 
1224 #ifdef SUPPORT_UNICODE
1225       if (utf)
1226         {
1227         SET_BIT(0xC2);  /* For U+00A0 */
1228         SET_BIT(0xE1);  /* For U+1680, U+180E */
1229         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1230         SET_BIT(0xE3);  /* For U+3000 */
1231         }
1232       else
1233 #endif
1234       /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1235       the code is EBCDIC. */
1236         {
1237 #ifndef EBCDIC
1238         SET_BIT(0xA0);
1239 #endif  /* Not EBCDIC */
1240         }
1241 #endif  /* 8-bit support */
1242 
1243       try_next = FALSE;
1244       break;
1245 
1246       case OP_ANYNL:
1247       case OP_VSPACE:
1248       SET_BIT(CHAR_LF);
1249       SET_BIT(CHAR_VT);
1250       SET_BIT(CHAR_FF);
1251       SET_BIT(CHAR_CR);
1252 
1253       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1254       the bits for NEL and for code units >= 255, independently of UTF. */
1255 
1256 #if PCRE2_CODE_UNIT_WIDTH != 8
1257       SET_BIT(CHAR_NEL);
1258       SET_BIT(0xFF);
1259 #else
1260       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1261       units of vertical space characters. */
1262 
1263 #ifdef SUPPORT_UNICODE
1264       if (utf)
1265         {
1266         SET_BIT(0xC2);  /* For U+0085 (NEL) */
1267         SET_BIT(0xE2);  /* For U+2028, U+2029 */
1268         }
1269       else
1270 #endif
1271       /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1272         {
1273         SET_BIT(CHAR_NEL);
1274         }
1275 #endif  /* 8-bit support */
1276 
1277       try_next = FALSE;
1278       break;
1279 
1280       /* Single character types set the bits and stop. Note that if PCRE2_UCP
1281       is set, we do not see these opcodes because \d etc are converted to
1282       properties. Therefore, these apply in the case when only characters less
1283       than 256 are recognized to match the types. */
1284 
1285       case OP_NOT_DIGIT:
1286       set_nottype_bits(re, cbit_digit, table_limit);
1287       try_next = FALSE;
1288       break;
1289 
1290       case OP_DIGIT:
1291       set_type_bits(re, cbit_digit, table_limit);
1292       try_next = FALSE;
1293       break;
1294 
1295       case OP_NOT_WHITESPACE:
1296       set_nottype_bits(re, cbit_space, table_limit);
1297       try_next = FALSE;
1298       break;
1299 
1300       case OP_WHITESPACE:
1301       set_type_bits(re, cbit_space, table_limit);
1302       try_next = FALSE;
1303       break;
1304 
1305       case OP_NOT_WORDCHAR:
1306       set_nottype_bits(re, cbit_word, table_limit);
1307       try_next = FALSE;
1308       break;
1309 
1310       case OP_WORDCHAR:
1311       set_type_bits(re, cbit_word, table_limit);
1312       try_next = FALSE;
1313       break;
1314 
1315       /* One or more character type fudges the pointer and restarts, knowing
1316       it will hit a single character type and stop there. */
1317 
1318       case OP_TYPEPLUS:
1319       case OP_TYPEMINPLUS:
1320       case OP_TYPEPOSPLUS:
1321       tcode++;
1322       break;
1323 
1324       case OP_TYPEEXACT:
1325       tcode += 1 + IMM2_SIZE;
1326       break;
1327 
1328       /* Zero or more repeats of character types set the bits and then
1329       try again. */
1330 
1331       case OP_TYPEUPTO:
1332       case OP_TYPEMINUPTO:
1333       case OP_TYPEPOSUPTO:
1334       tcode += IMM2_SIZE;  /* Fall through */
1335 
1336       case OP_TYPESTAR:
1337       case OP_TYPEMINSTAR:
1338       case OP_TYPEPOSSTAR:
1339       case OP_TYPEQUERY:
1340       case OP_TYPEMINQUERY:
1341       case OP_TYPEPOSQUERY:
1342       switch(tcode[1])
1343         {
1344         default:
1345         case OP_ANY:
1346         case OP_ALLANY:
1347         return SSB_FAIL;
1348 
1349         case OP_HSPACE:
1350         SET_BIT(CHAR_HT);
1351         SET_BIT(CHAR_SPACE);
1352 
1353         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1354         the bits for 0xA0 and for code units >= 255, independently of UTF. */
1355 
1356 #if PCRE2_CODE_UNIT_WIDTH != 8
1357         SET_BIT(0xA0);
1358         SET_BIT(0xFF);
1359 #else
1360         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1361         units of horizontal space characters. */
1362 
1363 #ifdef SUPPORT_UNICODE
1364         if (utf)
1365           {
1366           SET_BIT(0xC2);  /* For U+00A0 */
1367           SET_BIT(0xE1);  /* For U+1680, U+180E */
1368           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1369           SET_BIT(0xE3);  /* For U+3000 */
1370           }
1371         else
1372 #endif
1373         /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1374         the code is EBCDIC. */
1375           {
1376 #ifndef EBCDIC
1377           SET_BIT(0xA0);
1378 #endif  /* Not EBCDIC */
1379           }
1380 #endif  /* 8-bit support */
1381         break;
1382 
1383         case OP_ANYNL:
1384         case OP_VSPACE:
1385         SET_BIT(CHAR_LF);
1386         SET_BIT(CHAR_VT);
1387         SET_BIT(CHAR_FF);
1388         SET_BIT(CHAR_CR);
1389 
1390         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1391         the bits for NEL and for code units >= 255, independently of UTF. */
1392 
1393 #if PCRE2_CODE_UNIT_WIDTH != 8
1394         SET_BIT(CHAR_NEL);
1395         SET_BIT(0xFF);
1396 #else
1397         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1398         units of vertical space characters. */
1399 
1400 #ifdef SUPPORT_UNICODE
1401         if (utf)
1402           {
1403           SET_BIT(0xC2);  /* For U+0085 (NEL) */
1404           SET_BIT(0xE2);  /* For U+2028, U+2029 */
1405           }
1406         else
1407 #endif
1408         /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1409           {
1410           SET_BIT(CHAR_NEL);
1411           }
1412 #endif  /* 8-bit support */
1413         break;
1414 
1415         case OP_NOT_DIGIT:
1416         set_nottype_bits(re, cbit_digit, table_limit);
1417         break;
1418 
1419         case OP_DIGIT:
1420         set_type_bits(re, cbit_digit, table_limit);
1421         break;
1422 
1423         case OP_NOT_WHITESPACE:
1424         set_nottype_bits(re, cbit_space, table_limit);
1425         break;
1426 
1427         case OP_WHITESPACE:
1428         set_type_bits(re, cbit_space, table_limit);
1429         break;
1430 
1431         case OP_NOT_WORDCHAR:
1432         set_nottype_bits(re, cbit_word, table_limit);
1433         break;
1434 
1435         case OP_WORDCHAR:
1436         set_type_bits(re, cbit_word, table_limit);
1437         break;
1438         }
1439 
1440       tcode += 2;
1441       break;
1442 
1443       /* Extended class: if there are any property checks, or if this is a
1444       negative XCLASS without a map, give up. If there are no property checks,
1445       there must be wide characters on the XCLASS list, because otherwise an
1446       XCLASS would not have been created. This means that code points >= 255
1447       are always potential starters. */
1448 
1449 #ifdef SUPPORT_WIDE_CHARS
1450       case OP_XCLASS:
1451       if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
1452           (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1453         return SSB_FAIL;
1454 
1455       /* We have a positive XCLASS or a negative one without a map. Set up the
1456       map pointer if there is one, and fall through. */
1457 
1458       classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
1459         (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1460 #endif
1461       /* It seems that the fall through comment must be outside the #ifdef if
1462       it is to avoid the gcc compiler warning. */
1463 
1464       /* Fall through */
1465 
1466       /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1467       in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1468       because it starts a character with a value > 255. In 8-bit non-UTF mode,
1469       there is no difference between CLASS and NCLASS. In all other wide
1470       character modes, set the 0xFF bit to indicate code units >= 255. */
1471 
1472       case OP_NCLASS:
1473 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1474       if (utf)
1475         {
1476         re->start_bitmap[24] |= 0xf0;            /* Bits for 0xc4 - 0xc8 */
1477         memset(re->start_bitmap+25, 0xff, 7);    /* Bits for 0xc9 - 0xff */
1478         }
1479 #elif PCRE2_CODE_UNIT_WIDTH != 8
1480       SET_BIT(0xFF);                             /* For characters >= 255 */
1481 #endif
1482       /* Fall through */
1483 
1484       /* Enter here for a positive non-XCLASS. If we have fallen through from
1485       an XCLASS, classmap will already be set; just advance the code pointer.
1486       Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1487 
1488       case OP_CLASS:
1489       if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1490         {
1491         classmap = (uint8_t *)(++tcode);
1492         tcode += 32 / sizeof(PCRE2_UCHAR);
1493         }
1494 
1495       /* When wide characters are supported, classmap may be NULL. In UTF-8
1496       (sic) mode, the bits in a class bit map correspond to character values,
1497       not to byte values. However, the bit map we are constructing is for byte
1498       values. So we have to do a conversion for characters whose code point is
1499       greater than 127. In fact, there are only two possible starting bytes for
1500       characters in the range 128 - 255. */
1501 
1502       if (classmap != NULL)
1503         {
1504 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1505         if (utf)
1506           {
1507           for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1508           for (c = 128; c < 256; c++)
1509             {
1510             if ((classmap[c/8] & (1u << (c&7))) != 0)
1511               {
1512               int d = (c >> 6) | 0xc0;                 /* Set bit for this starter */
1513               re->start_bitmap[d/8] |= (1u << (d&7));  /* and then skip on to the */
1514               c = (c & 0xc0) + 0x40 - 1;               /* next relevant character. */
1515               }
1516             }
1517           }
1518         else
1519 #endif
1520         /* In all modes except UTF-8, the two bit maps are compatible. */
1521 
1522           {
1523           for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1524           }
1525         }
1526 
1527       /* Act on what follows the class. For a zero minimum repeat, continue;
1528       otherwise stop processing. */
1529 
1530       switch (*tcode)
1531         {
1532         case OP_CRSTAR:
1533         case OP_CRMINSTAR:
1534         case OP_CRQUERY:
1535         case OP_CRMINQUERY:
1536         case OP_CRPOSSTAR:
1537         case OP_CRPOSQUERY:
1538         tcode++;
1539         break;
1540 
1541         case OP_CRRANGE:
1542         case OP_CRMINRANGE:
1543         case OP_CRPOSRANGE:
1544         if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1545           else try_next = FALSE;
1546         break;
1547 
1548         default:
1549         try_next = FALSE;
1550         break;
1551         }
1552       break; /* End of class handling case */
1553       }      /* End of switch for opcodes */
1554     }        /* End of try_next loop */
1555 
1556   code += GET(code, 1);   /* Advance to next branch */
1557   }
1558 while (*code == OP_ALT);
1559 
1560 return yield;
1561 }
1562 
1563 
1564 
1565 /*************************************************
1566 *          Study a compiled expression           *
1567 *************************************************/
1568 
1569 /* This function is handed a compiled expression that it must study to produce
1570 information that will speed up the matching.
1571 
1572 Argument:  points to the compiled expression
1573 Returns:   0 normally; non-zero should never normally occur
1574            1 unknown opcode in set_start_bits
1575            2 missing capturing bracket
1576            3 unknown opcode in find_minlength
1577 */
1578 
1579 int
PRIV(study)1580 PRIV(study)(pcre2_real_code *re)
1581 {
1582 int min;
1583 int count = 0;
1584 PCRE2_UCHAR *code;
1585 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1586 
1587 /* Find start of compiled code */
1588 
1589 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1590   re->name_entry_size * re->name_count;
1591 
1592 /* For a pattern that has a first code unit, or a multiline pattern that
1593 matches only at "line start", there is no point in seeking a list of starting
1594 code units. */
1595 
1596 if ((re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1597   {
1598   int rc = set_start_bits(re, code, utf);
1599   if (rc == SSB_UNKNOWN) return 1;
1600   if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
1601   }
1602 
1603 /* Find the minimum length of subject string. If the pattern can match an empty
1604 string, the minimum length is already known. If there are more back references
1605 than the size of the vector we are going to cache them in, do nothing. A
1606 pattern that complicated will probably take a long time to analyze and may in
1607 any case turn out to be too complicated. Note that back reference minima are
1608 held as 16-bit numbers. */
1609 
1610 if ((re->flags & PCRE2_MATCH_EMPTY) == 0 &&
1611      re->top_backref <= MAX_CACHE_BACKREF)
1612   {
1613   int backref_cache[MAX_CACHE_BACKREF+1];
1614   backref_cache[0] = 0;    /* Highest one that is set */
1615   min = find_minlength(re, code, code, utf, NULL, &count, backref_cache);
1616   switch(min)
1617     {
1618     case -1:  /* \C in UTF mode or (*ACCEPT) or over-complex regex */
1619     break;    /* Leave minlength unchanged (will be zero) */
1620 
1621     case -2:
1622     return 2; /* missing capturing bracket */
1623 
1624     case -3:
1625     return 3; /* unrecognized opcode */
1626 
1627     default:
1628     if (min > UINT16_MAX) min = UINT16_MAX;
1629     re->minlength = min;
1630     break;
1631     }
1632   }
1633 
1634 return 0;
1635 }
1636 
1637 /* End of pcre2_study.c */
1638