• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9      Original API code Copyright (c) 1997-2012 University of Cambridge
10          New API code Copyright (c) 2016 University of Cambridge
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 /* This module contains functions for scanning a compiled pattern and
42 collecting data (e.g. minimum matching length). */
43 
44 
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 
49 
50 #include "pcre2_internal.h"
51 
52 
53 /* Set a bit in the starting code unit bit map. */
54 
55 #define SET_BIT(c) re->start_bitmap[(c)/8] |= (1 << ((c)&7))
56 
57 /* Returns from set_start_bits() */
58 
59 enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
60 
61 
62 /*************************************************
63 *   Find the minimum subject length for a group  *
64 *************************************************/
65 
66 /* Scan a parenthesized group and compute the minimum length of subject that
67 is needed to match it. This is a lower bound; it does not mean there is a
68 string of that length that matches. In UTF mode, the result is in characters
69 rather than code units. The field in a compiled pattern for storing the minimum
70 length is 16-bits long (on the grounds that anything longer than that is
71 pathological), so we give up when we reach that amount. This also means that
72 integer overflow for really crazy patterns cannot happen.
73 
74 Arguments:
75   re              compiled pattern block
76   code            pointer to start of group (the bracket)
77   startcode       pointer to start of the whole pattern's code
78   utf             UTF flag
79   recurses        chain of recurse_check to catch mutual recursion
80   countptr        pointer to call count (to catch over complexity)
81 
82 Returns:   the minimum length
83            -1 \C in UTF-8 mode
84               or (*ACCEPT)
85               or pattern too complicated
86               or back reference to duplicate name/number
87            -2 internal error (missing capturing bracket)
88            -3 internal error (opcode not listed)
89 */
90 
91 static int
find_minlength(const pcre2_real_code * re,PCRE2_SPTR code,PCRE2_SPTR startcode,BOOL utf,recurse_check * recurses,int * countptr)92 find_minlength(const pcre2_real_code *re, PCRE2_SPTR code,
93   PCRE2_SPTR startcode, BOOL utf, recurse_check *recurses, int *countptr)
94 {
95 int length = -1;
96 int prev_cap_recno = -1;
97 int prev_cap_d = 0;
98 int prev_recurse_recno = -1;
99 int prev_recurse_d = 0;
100 uint32_t once_fudge = 0;
101 BOOL had_recurse = FALSE;
102 BOOL dupcapused = (re->flags & PCRE2_DUPCAPUSED) != 0;
103 recurse_check this_recurse;
104 register int branchlength = 0;
105 register PCRE2_UCHAR *cc = (PCRE2_UCHAR *)code + 1 + LINK_SIZE;
106 
107 /* If this is a "could be empty" group, its minimum length is 0. */
108 
109 if (*code >= OP_SBRA && *code <= OP_SCOND) return 0;
110 
111 /* Skip over capturing bracket number */
112 
113 if (*code == OP_CBRA || *code == OP_CBRAPOS) cc += IMM2_SIZE;
114 
115 /* A large and/or complex regex can take too long to process. */
116 
117 if ((*countptr)++ > 1000) return -1;
118 
119 /* Scan along the opcodes for this branch. If we get to the end of the branch,
120 check the length against that of the other branches. If the accumulated length
121 passes 16-bits, stop. */
122 
123 for (;;)
124   {
125   int d, min, recno;
126   PCRE2_UCHAR *cs, *ce;
127   register PCRE2_UCHAR op = *cc;
128 
129   if (branchlength >= UINT16_MAX) return UINT16_MAX;
130 
131   switch (op)
132     {
133     case OP_COND:
134     case OP_SCOND:
135 
136     /* If there is only one branch in a condition, the implied branch has zero
137     length, so we don't add anything. This covers the DEFINE "condition"
138     automatically. If there are two branches we can treat it the same as any
139     other non-capturing subpattern. */
140 
141     cs = cc + GET(cc, 1);
142     if (*cs != OP_ALT)
143       {
144       cc = cs + 1 + LINK_SIZE;
145       break;
146       }
147     goto PROCESS_NON_CAPTURE;
148 
149     /* There's a special case of OP_ONCE, when it is wrapped round an
150     OP_RECURSE. We'd like to process the latter at this level so that
151     remembering the value works for repeated cases. So we do nothing, but
152     set a fudge value to skip over the OP_KET after the recurse. */
153 
154     case OP_ONCE:
155     if (cc[1+LINK_SIZE] == OP_RECURSE && cc[2*(1+LINK_SIZE)] == OP_KET)
156       {
157       once_fudge = 1 + LINK_SIZE;
158       cc += 1 + LINK_SIZE;
159       break;
160       }
161     /* Fall through */
162 
163     case OP_ONCE_NC:
164     case OP_BRA:
165     case OP_SBRA:
166     case OP_BRAPOS:
167     case OP_SBRAPOS:
168     PROCESS_NON_CAPTURE:
169     d = find_minlength(re, cc, startcode, utf, recurses, countptr);
170     if (d < 0) return d;
171     branchlength += d;
172     do cc += GET(cc, 1); while (*cc == OP_ALT);
173     cc += 1 + LINK_SIZE;
174     break;
175 
176     /* To save time for repeated capturing subpatterns, we remember the
177     length of the previous one. Unfortunately we can't do the same for
178     the unnumbered ones above. Nor can we do this if (?| is present in the
179     pattern because captures with the same number are not then identical. */
180 
181     case OP_CBRA:
182     case OP_SCBRA:
183     case OP_CBRAPOS:
184     case OP_SCBRAPOS:
185     recno = dupcapused? prev_cap_recno - 1 : (int)GET2(cc, 1+LINK_SIZE);
186     if (recno != prev_cap_recno)
187       {
188       prev_cap_recno = recno;
189       prev_cap_d = find_minlength(re, cc, startcode, utf, recurses, countptr);
190       if (prev_cap_d < 0) return prev_cap_d;
191       }
192     branchlength += prev_cap_d;
193     do cc += GET(cc, 1); while (*cc == OP_ALT);
194     cc += 1 + LINK_SIZE;
195     break;
196 
197     /* ACCEPT makes things far too complicated; we have to give up. */
198 
199     case OP_ACCEPT:
200     case OP_ASSERT_ACCEPT:
201     return -1;
202 
203     /* Reached end of a branch; if it's a ket it is the end of a nested
204     call. If it's ALT it is an alternation in a nested call. If it is END it's
205     the end of the outer call. All can be handled by the same code. If an
206     ACCEPT was previously encountered, use the length that was in force at that
207     time, and pass back the shortest ACCEPT length. */
208 
209     case OP_ALT:
210     case OP_KET:
211     case OP_KETRMAX:
212     case OP_KETRMIN:
213     case OP_KETRPOS:
214     case OP_END:
215     if (length < 0 || (!had_recurse && branchlength < length))
216       length = branchlength;
217     if (op != OP_ALT) return length;
218     cc += 1 + LINK_SIZE;
219     branchlength = 0;
220     had_recurse = FALSE;
221     break;
222 
223     /* Skip over assertive subpatterns */
224 
225     case OP_ASSERT:
226     case OP_ASSERT_NOT:
227     case OP_ASSERTBACK:
228     case OP_ASSERTBACK_NOT:
229     do cc += GET(cc, 1); while (*cc == OP_ALT);
230     /* Fall through */
231 
232     /* Skip over things that don't match chars */
233 
234     case OP_REVERSE:
235     case OP_CREF:
236     case OP_DNCREF:
237     case OP_RREF:
238     case OP_DNRREF:
239     case OP_FALSE:
240     case OP_TRUE:
241     case OP_CALLOUT:
242     case OP_SOD:
243     case OP_SOM:
244     case OP_EOD:
245     case OP_EODN:
246     case OP_CIRC:
247     case OP_CIRCM:
248     case OP_DOLL:
249     case OP_DOLLM:
250     case OP_NOT_WORD_BOUNDARY:
251     case OP_WORD_BOUNDARY:
252     cc += PRIV(OP_lengths)[*cc];
253     break;
254 
255     case OP_CALLOUT_STR:
256     cc += GET(cc, 1 + 2*LINK_SIZE);
257     break;
258 
259     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
260 
261     case OP_BRAZERO:
262     case OP_BRAMINZERO:
263     case OP_BRAPOSZERO:
264     case OP_SKIPZERO:
265     cc += PRIV(OP_lengths)[*cc];
266     do cc += GET(cc, 1); while (*cc == OP_ALT);
267     cc += 1 + LINK_SIZE;
268     break;
269 
270     /* Handle literal characters and + repetitions */
271 
272     case OP_CHAR:
273     case OP_CHARI:
274     case OP_NOT:
275     case OP_NOTI:
276     case OP_PLUS:
277     case OP_PLUSI:
278     case OP_MINPLUS:
279     case OP_MINPLUSI:
280     case OP_POSPLUS:
281     case OP_POSPLUSI:
282     case OP_NOTPLUS:
283     case OP_NOTPLUSI:
284     case OP_NOTMINPLUS:
285     case OP_NOTMINPLUSI:
286     case OP_NOTPOSPLUS:
287     case OP_NOTPOSPLUSI:
288     branchlength++;
289     cc += 2;
290 #ifdef SUPPORT_UNICODE
291     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
292 #endif
293     break;
294 
295     case OP_TYPEPLUS:
296     case OP_TYPEMINPLUS:
297     case OP_TYPEPOSPLUS:
298     branchlength++;
299     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
300     break;
301 
302     /* Handle exact repetitions. The count is already in characters, but we
303     may need to skip over a multibyte character in UTF mode.  */
304 
305     case OP_EXACT:
306     case OP_EXACTI:
307     case OP_NOTEXACT:
308     case OP_NOTEXACTI:
309     branchlength += GET2(cc,1);
310     cc += 2 + IMM2_SIZE;
311 #ifdef SUPPORT_UNICODE
312     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
313 #endif
314     break;
315 
316     case OP_TYPEEXACT:
317     branchlength += GET2(cc,1);
318     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
319       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
320     break;
321 
322     /* Handle single-char non-literal matchers */
323 
324     case OP_PROP:
325     case OP_NOTPROP:
326     cc += 2;
327     /* Fall through */
328 
329     case OP_NOT_DIGIT:
330     case OP_DIGIT:
331     case OP_NOT_WHITESPACE:
332     case OP_WHITESPACE:
333     case OP_NOT_WORDCHAR:
334     case OP_WORDCHAR:
335     case OP_ANY:
336     case OP_ALLANY:
337     case OP_EXTUNI:
338     case OP_HSPACE:
339     case OP_NOT_HSPACE:
340     case OP_VSPACE:
341     case OP_NOT_VSPACE:
342     branchlength++;
343     cc++;
344     break;
345 
346     /* "Any newline" might match two characters, but it also might match just
347     one. */
348 
349     case OP_ANYNL:
350     branchlength += 1;
351     cc++;
352     break;
353 
354     /* The single-byte matcher means we can't proceed in UTF mode. (In
355     non-UTF mode \C will actually be turned into OP_ALLANY, so won't ever
356     appear, but leave the code, just in case.) */
357 
358     case OP_ANYBYTE:
359 #ifdef SUPPORT_UNICODE
360     if (utf) return -1;
361 #endif
362     branchlength++;
363     cc++;
364     break;
365 
366     /* For repeated character types, we have to test for \p and \P, which have
367     an extra two bytes of parameters. */
368 
369     case OP_TYPESTAR:
370     case OP_TYPEMINSTAR:
371     case OP_TYPEQUERY:
372     case OP_TYPEMINQUERY:
373     case OP_TYPEPOSSTAR:
374     case OP_TYPEPOSQUERY:
375     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
376     cc += PRIV(OP_lengths)[op];
377     break;
378 
379     case OP_TYPEUPTO:
380     case OP_TYPEMINUPTO:
381     case OP_TYPEPOSUPTO:
382     if (cc[1 + IMM2_SIZE] == OP_PROP
383       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
384     cc += PRIV(OP_lengths)[op];
385     break;
386 
387     /* Check a class for variable quantification */
388 
389     case OP_CLASS:
390     case OP_NCLASS:
391 #ifdef SUPPORT_WIDE_CHARS
392     case OP_XCLASS:
393     /* The original code caused an unsigned overflow in 64 bit systems,
394     so now we use a conditional statement. */
395     if (op == OP_XCLASS)
396       cc += GET(cc, 1);
397     else
398       cc += PRIV(OP_lengths)[OP_CLASS];
399 #else
400     cc += PRIV(OP_lengths)[OP_CLASS];
401 #endif
402 
403     switch (*cc)
404       {
405       case OP_CRPLUS:
406       case OP_CRMINPLUS:
407       case OP_CRPOSPLUS:
408       branchlength++;
409       /* Fall through */
410 
411       case OP_CRSTAR:
412       case OP_CRMINSTAR:
413       case OP_CRQUERY:
414       case OP_CRMINQUERY:
415       case OP_CRPOSSTAR:
416       case OP_CRPOSQUERY:
417       cc++;
418       break;
419 
420       case OP_CRRANGE:
421       case OP_CRMINRANGE:
422       case OP_CRPOSRANGE:
423       branchlength += GET2(cc,1);
424       cc += 1 + 2 * IMM2_SIZE;
425       break;
426 
427       default:
428       branchlength++;
429       break;
430       }
431     break;
432 
433     /* Backreferences and subroutine calls (OP_RECURSE) are treated in the same
434     way: we find the minimum length for the subpattern. A recursion
435     (backreference or subroutine) causes an a flag to be set that causes the
436     length of this branch to be ignored. The logic is that a recursion can only
437     make sense if there is another alternative that stops the recursing. That
438     will provide the minimum length (when no recursion happens).
439 
440     If PCRE2_MATCH_UNSET_BACKREF is set, a backreference to an unset bracket
441     matches an empty string (by default it causes a matching failure), so in
442     that case we must set the minimum length to zero. */
443 
444     /* Duplicate named pattern back reference. We cannot reliably find a length
445     for this if duplicate numbers are present in the pattern. */
446 
447     case OP_DNREF:
448     case OP_DNREFI:
449     if (dupcapused) return -1;
450     if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
451       {
452       int count = GET2(cc, 1+IMM2_SIZE);
453       PCRE2_UCHAR *slot =
454         (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
455           GET2(cc, 1) * re->name_entry_size;
456 
457       d = INT_MAX;
458 
459       /* Scan all groups with the same name */
460 
461       while (count-- > 0)
462         {
463         ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(slot, 0));
464         if (cs == NULL) return -2;
465         do ce += GET(ce, 1); while (*ce == OP_ALT);
466         if (cc > cs && cc < ce)    /* Simple recursion */
467           {
468           d = 0;
469           had_recurse = TRUE;
470           break;
471           }
472         else
473           {
474           recurse_check *r = recurses;
475           for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
476           if (r != NULL)           /* Mutual recursion */
477             {
478             d = 0;
479             had_recurse = TRUE;
480             break;
481             }
482           else
483             {
484             int dd;
485             this_recurse.prev = recurses;
486             this_recurse.group = cs;
487             dd = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
488             if (dd < d) d = dd;
489             }
490           }
491         slot += re->name_entry_size;
492         }
493       }
494     else d = 0;
495     cc += 1 + 2*IMM2_SIZE;
496     goto REPEAT_BACK_REFERENCE;
497 
498     /* Single back reference. We cannot find a length for this if duplicate
499     numbers are present in the pattern. */
500 
501     case OP_REF:
502     case OP_REFI:
503     if (dupcapused) return -1;
504     if ((re->overall_options & PCRE2_MATCH_UNSET_BACKREF) == 0)
505       {
506       ce = cs = (PCRE2_UCHAR *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
507       if (cs == NULL) return -2;
508       do ce += GET(ce, 1); while (*ce == OP_ALT);
509       if (cc > cs && cc < ce)    /* Simple recursion */
510         {
511         d = 0;
512         had_recurse = TRUE;
513         }
514       else
515         {
516         recurse_check *r = recurses;
517         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
518         if (r != NULL)           /* Mutual recursion */
519           {
520           d = 0;
521           had_recurse = TRUE;
522           }
523         else
524           {
525           this_recurse.prev = recurses;
526           this_recurse.group = cs;
527           d = find_minlength(re, cs, startcode, utf, &this_recurse, countptr);
528           }
529         }
530       }
531     else d = 0;
532     cc += 1 + IMM2_SIZE;
533 
534     /* Handle repeated back references */
535 
536     REPEAT_BACK_REFERENCE:
537     switch (*cc)
538       {
539       case OP_CRSTAR:
540       case OP_CRMINSTAR:
541       case OP_CRQUERY:
542       case OP_CRMINQUERY:
543       case OP_CRPOSSTAR:
544       case OP_CRPOSQUERY:
545       min = 0;
546       cc++;
547       break;
548 
549       case OP_CRPLUS:
550       case OP_CRMINPLUS:
551       case OP_CRPOSPLUS:
552       min = 1;
553       cc++;
554       break;
555 
556       case OP_CRRANGE:
557       case OP_CRMINRANGE:
558       case OP_CRPOSRANGE:
559       min = GET2(cc, 1);
560       cc += 1 + 2 * IMM2_SIZE;
561       break;
562 
563       default:
564       min = 1;
565       break;
566       }
567 
568      /* Take care not to overflow: (1) min and d are ints, so check that their
569      product is not greater than INT_MAX. (2) branchlength is limited to
570      UINT16_MAX (checked at the top of the loop). */
571 
572     if ((d > 0 && (INT_MAX/d) < min) || UINT16_MAX - branchlength < min*d)
573       branchlength = UINT16_MAX;
574     else branchlength += min * d;
575     break;
576 
577     /* Recursion always refers to the first occurrence of a subpattern with a
578     given number. Therefore, we can always make use of caching, even when the
579     pattern contains multiple subpatterns with the same number. */
580 
581     case OP_RECURSE:
582     cs = ce = (PCRE2_UCHAR *)startcode + GET(cc, 1);
583     recno = GET2(cs, 1+LINK_SIZE);
584     if (recno == prev_recurse_recno)
585       {
586       branchlength += prev_recurse_d;
587       }
588     else
589       {
590       do ce += GET(ce, 1); while (*ce == OP_ALT);
591       if (cc > cs && cc < ce)    /* Simple recursion */
592         had_recurse = TRUE;
593       else
594         {
595         recurse_check *r = recurses;
596         for (r = recurses; r != NULL; r = r->prev) if (r->group == cs) break;
597         if (r != NULL)          /* Mutual recursion */
598           had_recurse = TRUE;
599         else
600           {
601           this_recurse.prev = recurses;
602           this_recurse.group = cs;
603           prev_recurse_d = find_minlength(re, cs, startcode, utf, &this_recurse,
604             countptr);
605           if (prev_recurse_d < 0) return prev_recurse_d;
606           prev_recurse_recno = recno;
607           branchlength += prev_recurse_d;
608           }
609         }
610       }
611     cc += 1 + LINK_SIZE + once_fudge;
612     once_fudge = 0;
613     break;
614 
615     /* Anything else does not or need not match a character. We can get the
616     item's length from the table, but for those that can match zero occurrences
617     of a character, we must take special action for UTF-8 characters. As it
618     happens, the "NOT" versions of these opcodes are used at present only for
619     ASCII characters, so they could be omitted from this list. However, in
620     future that may change, so we include them here so as not to leave a
621     gotcha for a future maintainer. */
622 
623     case OP_UPTO:
624     case OP_UPTOI:
625     case OP_NOTUPTO:
626     case OP_NOTUPTOI:
627     case OP_MINUPTO:
628     case OP_MINUPTOI:
629     case OP_NOTMINUPTO:
630     case OP_NOTMINUPTOI:
631     case OP_POSUPTO:
632     case OP_POSUPTOI:
633     case OP_NOTPOSUPTO:
634     case OP_NOTPOSUPTOI:
635 
636     case OP_STAR:
637     case OP_STARI:
638     case OP_NOTSTAR:
639     case OP_NOTSTARI:
640     case OP_MINSTAR:
641     case OP_MINSTARI:
642     case OP_NOTMINSTAR:
643     case OP_NOTMINSTARI:
644     case OP_POSSTAR:
645     case OP_POSSTARI:
646     case OP_NOTPOSSTAR:
647     case OP_NOTPOSSTARI:
648 
649     case OP_QUERY:
650     case OP_QUERYI:
651     case OP_NOTQUERY:
652     case OP_NOTQUERYI:
653     case OP_MINQUERY:
654     case OP_MINQUERYI:
655     case OP_NOTMINQUERY:
656     case OP_NOTMINQUERYI:
657     case OP_POSQUERY:
658     case OP_POSQUERYI:
659     case OP_NOTPOSQUERY:
660     case OP_NOTPOSQUERYI:
661 
662     cc += PRIV(OP_lengths)[op];
663 #ifdef SUPPORT_UNICODE
664     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
665 #endif
666     break;
667 
668     /* Skip these, but we need to add in the name length. */
669 
670     case OP_MARK:
671     case OP_PRUNE_ARG:
672     case OP_SKIP_ARG:
673     case OP_THEN_ARG:
674     cc += PRIV(OP_lengths)[op] + cc[1];
675     break;
676 
677     /* The remaining opcodes are just skipped over. */
678 
679     case OP_CLOSE:
680     case OP_COMMIT:
681     case OP_FAIL:
682     case OP_PRUNE:
683     case OP_SET_SOM:
684     case OP_SKIP:
685     case OP_THEN:
686     cc += PRIV(OP_lengths)[op];
687     break;
688 
689     /* This should not occur: we list all opcodes explicitly so that when
690     new ones get added they are properly considered. */
691 
692     default:
693     return -3;
694     }
695   }
696 /* Control never gets here */
697 }
698 
699 
700 
701 /*************************************************
702 *      Set a bit and maybe its alternate case    *
703 *************************************************/
704 
705 /* Given a character, set its first code unit's bit in the table, and also the
706 corresponding bit for the other version of a letter if we are caseless.
707 
708 Arguments:
709   re            points to the regex block
710   p             points to the first code unit of the character
711   caseless      TRUE if caseless
712   utf           TRUE for UTF mode
713 
714 Returns:        pointer after the character
715 */
716 
717 static PCRE2_SPTR
set_table_bit(pcre2_real_code * re,PCRE2_SPTR p,BOOL caseless,BOOL utf)718 set_table_bit(pcre2_real_code *re, PCRE2_SPTR p, BOOL caseless, BOOL utf)
719 {
720 uint32_t c = *p++;   /* First code unit */
721 (void)utf;           /* Stop compiler warning when UTF not supported */
722 
723 /* In 16-bit and 32-bit modes, code units greater than 0xff set the bit for
724 0xff. */
725 
726 #if PCRE2_CODE_UNIT_WIDTH != 8
727 if (c > 0xff) SET_BIT(0xff); else
728 #endif
729 
730 SET_BIT(c);
731 
732 /* In UTF-8 or UTF-16 mode, pick up the remaining code units in order to find
733 the end of the character, even when caseless. */
734 
735 #ifdef SUPPORT_UNICODE
736 if (utf)
737   {
738 #if PCRE2_CODE_UNIT_WIDTH == 8
739   if (c >= 0xc0) GETUTF8INC(c, p);
740 #elif PCRE2_CODE_UNIT_WIDTH == 16
741   if ((c & 0xfc00) == 0xd800) GETUTF16INC(c, p);
742 #endif
743   }
744 #endif  /* SUPPORT_UNICODE */
745 
746 /* If caseless, handle the other case of the character. */
747 
748 if (caseless)
749   {
750   if (utf)
751     {
752 #if PCRE2_CODE_UNIT_WIDTH == 8
753     PCRE2_UCHAR buff[6];
754     c = UCD_OTHERCASE(c);
755     (void)PRIV(ord2utf)(c, buff);
756     SET_BIT(buff[0]);
757 #else  /* 16-bit or 32-bit mode */
758     c = UCD_OTHERCASE(c);
759     if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
760 #endif
761     }
762 
763   /* Not UTF */
764 
765   else if (MAX_255(c)) SET_BIT(re->tables[fcc_offset + c]);
766   }
767 
768 return p;
769 }
770 
771 
772 
773 /*************************************************
774 *     Set bits for a positive character type     *
775 *************************************************/
776 
777 /* This function sets starting bits for a character type. In UTF-8 mode, we can
778 only do a direct setting for bytes less than 128, as otherwise there can be
779 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
780 environment, the tables will only recognize ASCII characters anyway, but in at
781 least one Windows environment, some higher bytes bits were set in the tables.
782 So we deal with that case by considering the UTF-8 encoding.
783 
784 Arguments:
785   re             the regex block
786   cbit type      the type of character wanted
787   table_limit    32 for non-UTF-8; 16 for UTF-8
788 
789 Returns:         nothing
790 */
791 
792 static void
set_type_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)793 set_type_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
794 {
795 register uint32_t c;
796 for (c = 0; c < table_limit; c++)
797   re->start_bitmap[c] |= re->tables[c+cbits_offset+cbit_type];
798 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
799 if (table_limit == 32) return;
800 for (c = 128; c < 256; c++)
801   {
802   if ((re->tables[cbits_offset + c/8] & (1 << (c&7))) != 0)
803     {
804     PCRE2_UCHAR buff[6];
805     (void)PRIV(ord2utf)(c, buff);
806     SET_BIT(buff[0]);
807     }
808   }
809 #endif  /* UTF-8 */
810 }
811 
812 
813 /*************************************************
814 *     Set bits for a negative character type     *
815 *************************************************/
816 
817 /* This function sets starting bits for a negative character type such as \D.
818 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
819 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
820 Unlike in the positive case, where we can set appropriate starting bits for
821 specific high-valued UTF-8 characters, in this case we have to set the bits for
822 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
823 0xc0 (192) for simplicity.
824 
825 Arguments:
826   re             the regex block
827   cbit type      the type of character wanted
828   table_limit    32 for non-UTF-8; 16 for UTF-8
829 
830 Returns:         nothing
831 */
832 
833 static void
set_nottype_bits(pcre2_real_code * re,int cbit_type,unsigned int table_limit)834 set_nottype_bits(pcre2_real_code *re, int cbit_type, unsigned int table_limit)
835 {
836 register uint32_t c;
837 for (c = 0; c < table_limit; c++)
838   re->start_bitmap[c] |= ~(re->tables[c+cbits_offset+cbit_type]);
839 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
840 if (table_limit != 32) for (c = 24; c < 32; c++) re->start_bitmap[c] = 0xff;
841 #endif
842 }
843 
844 
845 
846 /*************************************************
847 *          Create bitmap of starting bytes       *
848 *************************************************/
849 
850 /* This function scans a compiled unanchored expression recursively and
851 attempts to build a bitmap of the set of possible starting code units whose
852 values are less than 256. In 16-bit and 32-bit mode, values above 255 all cause
853 the 255 bit to be set. When calling set[_not]_type_bits() in UTF-8 (sic) mode
854 we pass a value of 16 rather than 32 as the final argument. (See comments in
855 those functions for the reason.)
856 
857 The SSB_CONTINUE return is useful for parenthesized groups in patterns such as
858 (a*)b where the group provides some optional starting code units but scanning
859 must continue at the outer level to find at least one mandatory code unit. At
860 the outermost level, this function fails unless the result is SSB_DONE.
861 
862 Arguments:
863   re           points to the compiled regex block
864   code         points to an expression
865   utf          TRUE if in UTF mode
866 
867 Returns:       SSB_FAIL     => Failed to find any starting code units
868                SSB_DONE     => Found mandatory starting code units
869                SSB_CONTINUE => Found optional starting code units
870                SSB_UNKNOWN  => Hit an unrecognized opcode
871 */
872 
873 static int
set_start_bits(pcre2_real_code * re,PCRE2_SPTR code,BOOL utf)874 set_start_bits(pcre2_real_code *re, PCRE2_SPTR code, BOOL utf)
875 {
876 register uint32_t c;
877 int yield = SSB_DONE;
878 
879 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
880 int table_limit = utf? 16:32;
881 #else
882 int table_limit = 32;
883 #endif
884 
885 do
886   {
887   BOOL try_next = TRUE;
888   PCRE2_SPTR tcode = code + 1 + LINK_SIZE;
889 
890   if (*code == OP_CBRA || *code == OP_SCBRA ||
891       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
892 
893   while (try_next)    /* Loop for items in this branch */
894     {
895     int rc;
896     uint8_t *classmap = NULL;
897 
898     switch(*tcode)
899       {
900       /* If we reach something we don't understand, it means a new opcode has
901       been created that hasn't been added to this function. Hopefully this
902       problem will be discovered during testing. */
903 
904       default:
905       return SSB_UNKNOWN;
906 
907       /* Fail for a valid opcode that implies no starting bits. */
908 
909       case OP_ACCEPT:
910       case OP_ASSERT_ACCEPT:
911       case OP_ALLANY:
912       case OP_ANY:
913       case OP_ANYBYTE:
914       case OP_CIRC:
915       case OP_CIRCM:
916       case OP_CLOSE:
917       case OP_COMMIT:
918       case OP_COND:
919       case OP_CREF:
920       case OP_FALSE:
921       case OP_TRUE:
922       case OP_DNCREF:
923       case OP_DNREF:
924       case OP_DNREFI:
925       case OP_DNRREF:
926       case OP_DOLL:
927       case OP_DOLLM:
928       case OP_END:
929       case OP_EOD:
930       case OP_EODN:
931       case OP_EXTUNI:
932       case OP_FAIL:
933       case OP_MARK:
934       case OP_NOT:
935       case OP_NOTEXACT:
936       case OP_NOTEXACTI:
937       case OP_NOTI:
938       case OP_NOTMINPLUS:
939       case OP_NOTMINPLUSI:
940       case OP_NOTMINQUERY:
941       case OP_NOTMINQUERYI:
942       case OP_NOTMINSTAR:
943       case OP_NOTMINSTARI:
944       case OP_NOTMINUPTO:
945       case OP_NOTMINUPTOI:
946       case OP_NOTPLUS:
947       case OP_NOTPLUSI:
948       case OP_NOTPOSPLUS:
949       case OP_NOTPOSPLUSI:
950       case OP_NOTPOSQUERY:
951       case OP_NOTPOSQUERYI:
952       case OP_NOTPOSSTAR:
953       case OP_NOTPOSSTARI:
954       case OP_NOTPOSUPTO:
955       case OP_NOTPOSUPTOI:
956       case OP_NOTPROP:
957       case OP_NOTQUERY:
958       case OP_NOTQUERYI:
959       case OP_NOTSTAR:
960       case OP_NOTSTARI:
961       case OP_NOTUPTO:
962       case OP_NOTUPTOI:
963       case OP_NOT_HSPACE:
964       case OP_NOT_VSPACE:
965       case OP_PRUNE:
966       case OP_PRUNE_ARG:
967       case OP_RECURSE:
968       case OP_REF:
969       case OP_REFI:
970       case OP_REVERSE:
971       case OP_RREF:
972       case OP_SCOND:
973       case OP_SET_SOM:
974       case OP_SKIP:
975       case OP_SKIP_ARG:
976       case OP_SOD:
977       case OP_SOM:
978       case OP_THEN:
979       case OP_THEN_ARG:
980       return SSB_FAIL;
981 
982       /* A "real" property test implies no starting bits, but the fake property
983       PT_CLIST identifies a list of characters. These lists are short, as they
984       are used for characters with more than one "other case", so there is no
985       point in recognizing them for OP_NOTPROP. */
986 
987       case OP_PROP:
988       if (tcode[1] != PT_CLIST) return SSB_FAIL;
989         {
990         const uint32_t *p = PRIV(ucd_caseless_sets) + tcode[2];
991         while ((c = *p++) < NOTACHAR)
992           {
993 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
994           if (utf)
995             {
996             PCRE2_UCHAR buff[6];
997             (void)PRIV(ord2utf)(c, buff);
998             c = buff[0];
999             }
1000 #endif
1001           if (c > 0xff) SET_BIT(0xff); else SET_BIT(c);
1002           }
1003         }
1004       try_next = FALSE;
1005       break;
1006 
1007       /* We can ignore word boundary tests. */
1008 
1009       case OP_WORD_BOUNDARY:
1010       case OP_NOT_WORD_BOUNDARY:
1011       tcode++;
1012       break;
1013 
1014       /* If we hit a bracket or a positive lookahead assertion, recurse to set
1015       bits from within the subpattern. If it can't find anything, we have to
1016       give up. If it finds some mandatory character(s), we are done for this
1017       branch. Otherwise, carry on scanning after the subpattern. */
1018 
1019       case OP_BRA:
1020       case OP_SBRA:
1021       case OP_CBRA:
1022       case OP_SCBRA:
1023       case OP_BRAPOS:
1024       case OP_SBRAPOS:
1025       case OP_CBRAPOS:
1026       case OP_SCBRAPOS:
1027       case OP_ONCE:
1028       case OP_ONCE_NC:
1029       case OP_ASSERT:
1030       rc = set_start_bits(re, tcode, utf);
1031       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1032       if (rc == SSB_DONE) try_next = FALSE; else
1033         {
1034         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1035         tcode += 1 + LINK_SIZE;
1036         }
1037       break;
1038 
1039       /* If we hit ALT or KET, it means we haven't found anything mandatory in
1040       this branch, though we might have found something optional. For ALT, we
1041       continue with the next alternative, but we have to arrange that the final
1042       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
1043       return SSB_CONTINUE: if this is the top level, that indicates failure,
1044       but after a nested subpattern, it causes scanning to continue. */
1045 
1046       case OP_ALT:
1047       yield = SSB_CONTINUE;
1048       try_next = FALSE;
1049       break;
1050 
1051       case OP_KET:
1052       case OP_KETRMAX:
1053       case OP_KETRMIN:
1054       case OP_KETRPOS:
1055       return SSB_CONTINUE;
1056 
1057       /* Skip over callout */
1058 
1059       case OP_CALLOUT:
1060       tcode += PRIV(OP_lengths)[OP_CALLOUT];
1061       break;
1062 
1063       case OP_CALLOUT_STR:
1064       tcode += GET(tcode, 1 + 2*LINK_SIZE);
1065       break;
1066 
1067       /* Skip over lookbehind and negative lookahead assertions */
1068 
1069       case OP_ASSERT_NOT:
1070       case OP_ASSERTBACK:
1071       case OP_ASSERTBACK_NOT:
1072       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
1073       tcode += 1 + LINK_SIZE;
1074       break;
1075 
1076       /* BRAZERO does the bracket, but carries on. */
1077 
1078       case OP_BRAZERO:
1079       case OP_BRAMINZERO:
1080       case OP_BRAPOSZERO:
1081       rc = set_start_bits(re, ++tcode, utf);
1082       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
1083       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1084       tcode += 1 + LINK_SIZE;
1085       break;
1086 
1087       /* SKIPZERO skips the bracket. */
1088 
1089       case OP_SKIPZERO:
1090       tcode++;
1091       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
1092       tcode += 1 + LINK_SIZE;
1093       break;
1094 
1095       /* Single-char * or ? sets the bit and tries the next item */
1096 
1097       case OP_STAR:
1098       case OP_MINSTAR:
1099       case OP_POSSTAR:
1100       case OP_QUERY:
1101       case OP_MINQUERY:
1102       case OP_POSQUERY:
1103       tcode = set_table_bit(re, tcode + 1, FALSE, utf);
1104       break;
1105 
1106       case OP_STARI:
1107       case OP_MINSTARI:
1108       case OP_POSSTARI:
1109       case OP_QUERYI:
1110       case OP_MINQUERYI:
1111       case OP_POSQUERYI:
1112       tcode = set_table_bit(re, tcode + 1, TRUE, utf);
1113       break;
1114 
1115       /* Single-char upto sets the bit and tries the next */
1116 
1117       case OP_UPTO:
1118       case OP_MINUPTO:
1119       case OP_POSUPTO:
1120       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, FALSE, utf);
1121       break;
1122 
1123       case OP_UPTOI:
1124       case OP_MINUPTOI:
1125       case OP_POSUPTOI:
1126       tcode = set_table_bit(re, tcode + 1 + IMM2_SIZE, TRUE, utf);
1127       break;
1128 
1129       /* At least one single char sets the bit and stops */
1130 
1131       case OP_EXACT:
1132       tcode += IMM2_SIZE;
1133       /* Fall through */
1134       case OP_CHAR:
1135       case OP_PLUS:
1136       case OP_MINPLUS:
1137       case OP_POSPLUS:
1138       (void)set_table_bit(re, tcode + 1, FALSE, utf);
1139       try_next = FALSE;
1140       break;
1141 
1142       case OP_EXACTI:
1143       tcode += IMM2_SIZE;
1144       /* Fall through */
1145       case OP_CHARI:
1146       case OP_PLUSI:
1147       case OP_MINPLUSI:
1148       case OP_POSPLUSI:
1149       (void)set_table_bit(re, tcode + 1, TRUE, utf);
1150       try_next = FALSE;
1151       break;
1152 
1153       /* Special spacing and line-terminating items. These recognize specific
1154       lists of characters. The difference between VSPACE and ANYNL is that the
1155       latter can match the two-character CRLF sequence, but that is not
1156       relevant for finding the first character, so their code here is
1157       identical. */
1158 
1159       case OP_HSPACE:
1160       SET_BIT(CHAR_HT);
1161       SET_BIT(CHAR_SPACE);
1162 
1163       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1164       the bits for 0xA0 and for code units >= 255, independently of UTF. */
1165 
1166 #if PCRE2_CODE_UNIT_WIDTH != 8
1167       SET_BIT(0xA0);
1168       SET_BIT(0xFF);
1169 #else
1170       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1171       units of horizontal space characters. */
1172 
1173 #ifdef SUPPORT_UNICODE
1174       if (utf)
1175         {
1176         SET_BIT(0xC2);  /* For U+00A0 */
1177         SET_BIT(0xE1);  /* For U+1680, U+180E */
1178         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1179         SET_BIT(0xE3);  /* For U+3000 */
1180         }
1181       else
1182 #endif
1183       /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1184       the code is EBCDIC. */
1185         {
1186 #ifndef EBCDIC
1187         SET_BIT(0xA0);
1188 #endif  /* Not EBCDIC */
1189         }
1190 #endif  /* 8-bit support */
1191 
1192       try_next = FALSE;
1193       break;
1194 
1195       case OP_ANYNL:
1196       case OP_VSPACE:
1197       SET_BIT(CHAR_LF);
1198       SET_BIT(CHAR_VT);
1199       SET_BIT(CHAR_FF);
1200       SET_BIT(CHAR_CR);
1201 
1202       /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1203       the bits for NEL and for code units >= 255, independently of UTF. */
1204 
1205 #if PCRE2_CODE_UNIT_WIDTH != 8
1206       SET_BIT(CHAR_NEL);
1207       SET_BIT(0xFF);
1208 #else
1209       /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1210       units of vertical space characters. */
1211 
1212 #ifdef SUPPORT_UNICODE
1213       if (utf)
1214         {
1215         SET_BIT(0xC2);  /* For U+0085 (NEL) */
1216         SET_BIT(0xE2);  /* For U+2028, U+2029 */
1217         }
1218       else
1219 #endif
1220       /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1221         {
1222         SET_BIT(CHAR_NEL);
1223         }
1224 #endif  /* 8-bit support */
1225 
1226       try_next = FALSE;
1227       break;
1228 
1229       /* Single character types set the bits and stop. Note that if PCRE2_UCP
1230       is set, we do not see these op codes because \d etc are converted to
1231       properties. Therefore, these apply in the case when only characters less
1232       than 256 are recognized to match the types. */
1233 
1234       case OP_NOT_DIGIT:
1235       set_nottype_bits(re, cbit_digit, table_limit);
1236       try_next = FALSE;
1237       break;
1238 
1239       case OP_DIGIT:
1240       set_type_bits(re, cbit_digit, table_limit);
1241       try_next = FALSE;
1242       break;
1243 
1244       case OP_NOT_WHITESPACE:
1245       set_nottype_bits(re, cbit_space, table_limit);
1246       try_next = FALSE;
1247       break;
1248 
1249       case OP_WHITESPACE:
1250       set_type_bits(re, cbit_space, table_limit);
1251       try_next = FALSE;
1252       break;
1253 
1254       case OP_NOT_WORDCHAR:
1255       set_nottype_bits(re, cbit_word, table_limit);
1256       try_next = FALSE;
1257       break;
1258 
1259       case OP_WORDCHAR:
1260       set_type_bits(re, cbit_word, table_limit);
1261       try_next = FALSE;
1262       break;
1263 
1264       /* One or more character type fudges the pointer and restarts, knowing
1265       it will hit a single character type and stop there. */
1266 
1267       case OP_TYPEPLUS:
1268       case OP_TYPEMINPLUS:
1269       case OP_TYPEPOSPLUS:
1270       tcode++;
1271       break;
1272 
1273       case OP_TYPEEXACT:
1274       tcode += 1 + IMM2_SIZE;
1275       break;
1276 
1277       /* Zero or more repeats of character types set the bits and then
1278       try again. */
1279 
1280       case OP_TYPEUPTO:
1281       case OP_TYPEMINUPTO:
1282       case OP_TYPEPOSUPTO:
1283       tcode += IMM2_SIZE;  /* Fall through */
1284 
1285       case OP_TYPESTAR:
1286       case OP_TYPEMINSTAR:
1287       case OP_TYPEPOSSTAR:
1288       case OP_TYPEQUERY:
1289       case OP_TYPEMINQUERY:
1290       case OP_TYPEPOSQUERY:
1291       switch(tcode[1])
1292         {
1293         default:
1294         case OP_ANY:
1295         case OP_ALLANY:
1296         return SSB_FAIL;
1297 
1298         case OP_HSPACE:
1299         SET_BIT(CHAR_HT);
1300         SET_BIT(CHAR_SPACE);
1301 
1302         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1303         the bits for 0xA0 and for code units >= 255, independently of UTF. */
1304 
1305 #if PCRE2_CODE_UNIT_WIDTH != 8
1306         SET_BIT(0xA0);
1307         SET_BIT(0xFF);
1308 #else
1309         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1310         units of horizontal space characters. */
1311 
1312 #ifdef SUPPORT_UNICODE
1313         if (utf)
1314           {
1315           SET_BIT(0xC2);  /* For U+00A0 */
1316           SET_BIT(0xE1);  /* For U+1680, U+180E */
1317           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1318           SET_BIT(0xE3);  /* For U+3000 */
1319           }
1320         else
1321 #endif
1322         /* For the 8-bit library not in UTF-8 mode, set the bit for 0xA0, unless
1323         the code is EBCDIC. */
1324           {
1325 #ifndef EBCDIC
1326           SET_BIT(0xA0);
1327 #endif  /* Not EBCDIC */
1328           }
1329 #endif  /* 8-bit support */
1330         break;
1331 
1332         case OP_ANYNL:
1333         case OP_VSPACE:
1334         SET_BIT(CHAR_LF);
1335         SET_BIT(CHAR_VT);
1336         SET_BIT(CHAR_FF);
1337         SET_BIT(CHAR_CR);
1338 
1339         /* For the 16-bit and 32-bit libraries (which can never be EBCDIC), set
1340         the bits for NEL and for code units >= 255, independently of UTF. */
1341 
1342 #if PCRE2_CODE_UNIT_WIDTH != 8
1343         SET_BIT(CHAR_NEL);
1344         SET_BIT(0xFF);
1345 #else
1346         /* For the 8-bit library in UTF-8 mode, set the bits for the first code
1347         units of vertical space characters. */
1348 
1349 #ifdef SUPPORT_UNICODE
1350         if (utf)
1351           {
1352           SET_BIT(0xC2);  /* For U+0085 (NEL) */
1353           SET_BIT(0xE2);  /* For U+2028, U+2029 */
1354           }
1355         else
1356 #endif
1357         /* For the 8-bit library not in UTF-8 mode, set the bit for NEL. */
1358           {
1359           SET_BIT(CHAR_NEL);
1360           }
1361 #endif  /* 8-bit support */
1362         break;
1363 
1364         case OP_NOT_DIGIT:
1365         set_nottype_bits(re, cbit_digit, table_limit);
1366         break;
1367 
1368         case OP_DIGIT:
1369         set_type_bits(re, cbit_digit, table_limit);
1370         break;
1371 
1372         case OP_NOT_WHITESPACE:
1373         set_nottype_bits(re, cbit_space, table_limit);
1374         break;
1375 
1376         case OP_WHITESPACE:
1377         set_type_bits(re, cbit_space, table_limit);
1378         break;
1379 
1380         case OP_NOT_WORDCHAR:
1381         set_nottype_bits(re, cbit_word, table_limit);
1382         break;
1383 
1384         case OP_WORDCHAR:
1385         set_type_bits(re, cbit_word, table_limit);
1386         break;
1387         }
1388 
1389       tcode += 2;
1390       break;
1391 
1392       /* Extended class: if there are any property checks, or if this is a
1393       negative XCLASS without a map, give up. If there are no property checks,
1394       there must be wide characters on the XCLASS list, because otherwise an
1395       XCLASS would not have been created. This means that code points >= 255
1396       are always potential starters. */
1397 
1398 #ifdef SUPPORT_WIDE_CHARS
1399       case OP_XCLASS:
1400       if ((tcode[1 + LINK_SIZE] & XCL_HASPROP) != 0 ||
1401           (tcode[1 + LINK_SIZE] & (XCL_MAP|XCL_NOT)) == XCL_NOT)
1402         return SSB_FAIL;
1403 
1404       /* We have a positive XCLASS or a negative one without a map. Set up the
1405       map pointer if there is one, and fall through. */
1406 
1407       classmap = ((tcode[1 + LINK_SIZE] & XCL_MAP) == 0)? NULL :
1408         (uint8_t *)(tcode + 1 + LINK_SIZE + 1);
1409 #endif
1410 
1411       /* Enter here for a negative non-XCLASS. In the 8-bit library, if we are
1412       in UTF mode, any byte with a value >= 0xc4 is a potentially valid starter
1413       because it starts a character with a value > 255. In 8-bit non-UTF mode,
1414       there is no difference between CLASS and NCLASS. In all other wide
1415       character modes, set the 0xFF bit to indicate code units >= 255. */
1416 
1417       case OP_NCLASS:
1418 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1419       if (utf)
1420         {
1421         re->start_bitmap[24] |= 0xf0;            /* Bits for 0xc4 - 0xc8 */
1422         memset(re->start_bitmap+25, 0xff, 7);    /* Bits for 0xc9 - 0xff */
1423         }
1424 #elif PCRE2_CODE_UNIT_WIDTH != 8
1425       SET_BIT(0xFF);                             /* For characters >= 255 */
1426 #endif
1427       /* Fall through */
1428 
1429       /* Enter here for a positive non-XCLASS. If we have fallen through from
1430       an XCLASS, classmap will already be set; just advance the code pointer.
1431       Otherwise, set up classmap for a a non-XCLASS and advance past it. */
1432 
1433       case OP_CLASS:
1434       if (*tcode == OP_XCLASS) tcode += GET(tcode, 1); else
1435         {
1436         classmap = (uint8_t *)(++tcode);
1437         tcode += 32 / sizeof(PCRE2_UCHAR);
1438         }
1439 
1440       /* When wide characters are supported, classmap may be NULL. In UTF-8
1441       (sic) mode, the bits in a class bit map correspond to character values,
1442       not to byte values. However, the bit map we are constructing is for byte
1443       values. So we have to do a conversion for characters whose code point is
1444       greater than 127. In fact, there are only two possible starting bytes for
1445       characters in the range 128 - 255. */
1446 
1447       if (classmap != NULL)
1448         {
1449 #if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH == 8
1450         if (utf)
1451           {
1452           for (c = 0; c < 16; c++) re->start_bitmap[c] |= classmap[c];
1453           for (c = 128; c < 256; c++)
1454             {
1455             if ((classmap[c/8] & (1 << (c&7))) != 0)
1456               {
1457               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1458               re->start_bitmap[d/8] |= (1 << (d&7));  /* and then skip on to the */
1459               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1460               }
1461             }
1462           }
1463         else
1464 #endif
1465         /* In all modes except UTF-8, the two bit maps are compatible. */
1466 
1467           {
1468           for (c = 0; c < 32; c++) re->start_bitmap[c] |= classmap[c];
1469           }
1470         }
1471 
1472       /* Act on what follows the class. For a zero minimum repeat, continue;
1473       otherwise stop processing. */
1474 
1475       switch (*tcode)
1476         {
1477         case OP_CRSTAR:
1478         case OP_CRMINSTAR:
1479         case OP_CRQUERY:
1480         case OP_CRMINQUERY:
1481         case OP_CRPOSSTAR:
1482         case OP_CRPOSQUERY:
1483         tcode++;
1484         break;
1485 
1486         case OP_CRRANGE:
1487         case OP_CRMINRANGE:
1488         case OP_CRPOSRANGE:
1489         if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1490           else try_next = FALSE;
1491         break;
1492 
1493         default:
1494         try_next = FALSE;
1495         break;
1496         }
1497       break; /* End of class handling case */
1498       }      /* End of switch for opcodes */
1499     }        /* End of try_next loop */
1500 
1501   code += GET(code, 1);   /* Advance to next branch */
1502   }
1503 while (*code == OP_ALT);
1504 
1505 return yield;
1506 }
1507 
1508 
1509 
1510 /*************************************************
1511 *          Study a compiled expression           *
1512 *************************************************/
1513 
1514 /* This function is handed a compiled expression that it must study to produce
1515 information that will speed up the matching.
1516 
1517 Argument:  points to the compiled expression
1518 Returns:   0 normally; non-zero should never normally occur
1519            1 unknown opcode in set_start_bits
1520            2 missing capturing bracket
1521            3 unknown opcode in find_minlength
1522 */
1523 
1524 int
PRIV(study)1525 PRIV(study)(pcre2_real_code *re)
1526 {
1527 int min;
1528 int count = 0;
1529 PCRE2_UCHAR *code;
1530 BOOL utf = (re->overall_options & PCRE2_UTF) != 0;
1531 
1532 /* Find start of compiled code */
1533 
1534 code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
1535   re->name_entry_size * re->name_count;
1536 
1537 /* For an anchored pattern, or an unanchored pattern that has a first code
1538 unit, or a multiline pattern that matches only at "line start", there is no
1539 point in seeking a list of starting code units. */
1540 
1541 if ((re->overall_options & PCRE2_ANCHORED) == 0 &&
1542     (re->flags & (PCRE2_FIRSTSET|PCRE2_STARTLINE)) == 0)
1543   {
1544   int rc = set_start_bits(re, code, utf);
1545   if (rc == SSB_UNKNOWN) return 1;
1546   if (rc == SSB_DONE) re->flags |= PCRE2_FIRSTMAPSET;
1547   }
1548 
1549 /* Find the minimum length of subject string. If it can match an empty string,
1550 the minimum length is already known. */
1551 
1552 if ((re->flags & PCRE2_MATCH_EMPTY) == 0)
1553   {
1554   switch(min = find_minlength(re, code, code, utf, NULL, &count))
1555     {
1556     case -1:  /* \C in UTF mode or (*ACCEPT) or over-complex regex */
1557     break;    /* Leave minlength unchanged (will be zero) */
1558 
1559     case -2:
1560     return 2; /* missing capturing bracket */
1561 
1562     case -3:
1563     return 3; /* unrecognized opcode */
1564 
1565     default:
1566     if (min > UINT16_MAX) min = UINT16_MAX;
1567     re->minlength = min;
1568     break;
1569     }
1570   }
1571 
1572 return 0;
1573 }
1574 
1575 /* End of pcre2_study.c */
1576