• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9            Copyright (c) 1997-2012 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17 
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21 
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_study(), along with local
42 supporting functions. */
43 
44 
45 #include "config.h"
46 
47 #include "pcre_internal.h"
48 
/* Set the bit for value c in the bit map called start_bits, which must be in
scope wherever the macro is used. The argument and the whole expansion are
parenthesized so that an expression argument such as (a + b) selects the
intended byte and bit. Note that the argument is evaluated twice, so it must
not have side effects. */

#define SET_BIT(c) (start_bits[(c)/8] |= (1 << ((c)&7)))

/* Returns from set_start_bits() */

enum { SSB_FAIL, SSB_DONE, SSB_CONTINUE, SSB_UNKNOWN };
54 
55 
56 
57 /*************************************************
58 *   Find the minimum subject length for a group  *
59 *************************************************/
60 
61 /* Scan a parenthesized group and compute the minimum length of subject that
62 is needed to match it. This is a lower bound; it does not mean there is a
63 string of that length that matches. In UTF8 mode, the result is in characters
64 rather than bytes.
65 
66 Arguments:
67   code            pointer to start of group (the bracket)
68   startcode       pointer to start of the whole pattern
69   options         the compiling options
70   int             RECURSE depth
71 
72 Returns:   the minimum length
73            -1 if \C in UTF-8 mode or (*ACCEPT) was encountered
74            -2 internal error (missing capturing bracket)
75            -3 internal error (opcode not listed)
76 */
77 
78 static int
find_minlength(const pcre_uchar * code,const pcre_uchar * startcode,int options,int recurse_depth)79 find_minlength(const pcre_uchar *code, const pcre_uchar *startcode, int options,
80   int recurse_depth)
81 {
82 int length = -1;
83 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
84 BOOL utf = (options & PCRE_UTF8) != 0;
85 BOOL had_recurse = FALSE;
86 int branchlength = 0;
87 pcre_uchar *cc = (pcre_uchar *)code + 1 + LINK_SIZE;
88 
89 if (*code == OP_CBRA || *code == OP_SCBRA ||
90     *code == OP_CBRAPOS || *code == OP_SCBRAPOS) cc += IMM2_SIZE;
91 
92 /* Scan along the opcodes for this branch. If we get to the end of the
93 branch, check the length against that of the other branches. */
94 
95 for (;;)
96   {
97   int d, min;
98   pcre_uchar *cs, *ce;
99   int op = *cc;
100 
101   switch (op)
102     {
103     case OP_COND:
104     case OP_SCOND:
105 
106     /* If there is only one branch in a condition, the implied branch has zero
107     length, so we don't add anything. This covers the DEFINE "condition"
108     automatically. */
109 
110     cs = cc + GET(cc, 1);
111     if (*cs != OP_ALT)
112       {
113       cc = cs + 1 + LINK_SIZE;
114       break;
115       }
116 
117     /* Otherwise we can fall through and treat it the same as any other
118     subpattern. */
119 
120     case OP_CBRA:
121     case OP_SCBRA:
122     case OP_BRA:
123     case OP_SBRA:
124     case OP_CBRAPOS:
125     case OP_SCBRAPOS:
126     case OP_BRAPOS:
127     case OP_SBRAPOS:
128     case OP_ONCE:
129     case OP_ONCE_NC:
130     d = find_minlength(cc, startcode, options, recurse_depth);
131     if (d < 0) return d;
132     branchlength += d;
133     do cc += GET(cc, 1); while (*cc == OP_ALT);
134     cc += 1 + LINK_SIZE;
135     break;
136 
137     /* ACCEPT makes things far too complicated; we have to give up. */
138 
139     case OP_ACCEPT:
140     case OP_ASSERT_ACCEPT:
141     return -1;
142 
143     /* Reached end of a branch; if it's a ket it is the end of a nested
144     call. If it's ALT it is an alternation in a nested call. If it is END it's
145     the end of the outer call. All can be handled by the same code. If an
146     ACCEPT was previously encountered, use the length that was in force at that
147     time, and pass back the shortest ACCEPT length. */
148 
149     case OP_ALT:
150     case OP_KET:
151     case OP_KETRMAX:
152     case OP_KETRMIN:
153     case OP_KETRPOS:
154     case OP_END:
155     if (length < 0 || (!had_recurse && branchlength < length))
156       length = branchlength;
157     if (op != OP_ALT) return length;
158     cc += 1 + LINK_SIZE;
159     branchlength = 0;
160     had_recurse = FALSE;
161     break;
162 
163     /* Skip over assertive subpatterns */
164 
165     case OP_ASSERT:
166     case OP_ASSERT_NOT:
167     case OP_ASSERTBACK:
168     case OP_ASSERTBACK_NOT:
169     do cc += GET(cc, 1); while (*cc == OP_ALT);
170     /* Fall through */
171 
172     /* Skip over things that don't match chars */
173 
174     case OP_REVERSE:
175     case OP_CREF:
176     case OP_NCREF:
177     case OP_RREF:
178     case OP_NRREF:
179     case OP_DEF:
180     case OP_CALLOUT:
181     case OP_SOD:
182     case OP_SOM:
183     case OP_EOD:
184     case OP_EODN:
185     case OP_CIRC:
186     case OP_CIRCM:
187     case OP_DOLL:
188     case OP_DOLLM:
189     case OP_NOT_WORD_BOUNDARY:
190     case OP_WORD_BOUNDARY:
191     cc += PRIV(OP_lengths)[*cc];
192     break;
193 
194     /* Skip over a subpattern that has a {0} or {0,x} quantifier */
195 
196     case OP_BRAZERO:
197     case OP_BRAMINZERO:
198     case OP_BRAPOSZERO:
199     case OP_SKIPZERO:
200     cc += PRIV(OP_lengths)[*cc];
201     do cc += GET(cc, 1); while (*cc == OP_ALT);
202     cc += 1 + LINK_SIZE;
203     break;
204 
205     /* Handle literal characters and + repetitions */
206 
207     case OP_CHAR:
208     case OP_CHARI:
209     case OP_NOT:
210     case OP_NOTI:
211     case OP_PLUS:
212     case OP_PLUSI:
213     case OP_MINPLUS:
214     case OP_MINPLUSI:
215     case OP_POSPLUS:
216     case OP_POSPLUSI:
217     case OP_NOTPLUS:
218     case OP_NOTPLUSI:
219     case OP_NOTMINPLUS:
220     case OP_NOTMINPLUSI:
221     case OP_NOTPOSPLUS:
222     case OP_NOTPOSPLUSI:
223     branchlength++;
224     cc += 2;
225 #ifdef SUPPORT_UTF
226     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
227 #endif
228     break;
229 
230     case OP_TYPEPLUS:
231     case OP_TYPEMINPLUS:
232     case OP_TYPEPOSPLUS:
233     branchlength++;
234     cc += (cc[1] == OP_PROP || cc[1] == OP_NOTPROP)? 4 : 2;
235     break;
236 
237     /* Handle exact repetitions. The count is already in characters, but we
238     need to skip over a multibyte character in UTF8 mode.  */
239 
240     case OP_EXACT:
241     case OP_EXACTI:
242     case OP_NOTEXACT:
243     case OP_NOTEXACTI:
244     branchlength += GET2(cc,1);
245     cc += 2 + IMM2_SIZE;
246 #ifdef SUPPORT_UTF
247     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
248 #endif
249     break;
250 
251     case OP_TYPEEXACT:
252     branchlength += GET2(cc,1);
253     cc += 2 + IMM2_SIZE + ((cc[1 + IMM2_SIZE] == OP_PROP
254       || cc[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
255     break;
256 
257     /* Handle single-char non-literal matchers */
258 
259     case OP_PROP:
260     case OP_NOTPROP:
261     cc += 2;
262     /* Fall through */
263 
264     case OP_NOT_DIGIT:
265     case OP_DIGIT:
266     case OP_NOT_WHITESPACE:
267     case OP_WHITESPACE:
268     case OP_NOT_WORDCHAR:
269     case OP_WORDCHAR:
270     case OP_ANY:
271     case OP_ALLANY:
272     case OP_EXTUNI:
273     case OP_HSPACE:
274     case OP_NOT_HSPACE:
275     case OP_VSPACE:
276     case OP_NOT_VSPACE:
277     branchlength++;
278     cc++;
279     break;
280 
281     /* "Any newline" might match two characters, but it also might match just
282     one. */
283 
284     case OP_ANYNL:
285     branchlength += 1;
286     cc++;
287     break;
288 
289     /* The single-byte matcher means we can't proceed in UTF-8 mode. (In
290     non-UTF-8 mode \C will actually be turned into OP_ALLANY, so won't ever
291     appear, but leave the code, just in case.) */
292 
293     case OP_ANYBYTE:
294 #ifdef SUPPORT_UTF
295     if (utf) return -1;
296 #endif
297     branchlength++;
298     cc++;
299     break;
300 
301     /* For repeated character types, we have to test for \p and \P, which have
302     an extra two bytes of parameters. */
303 
304     case OP_TYPESTAR:
305     case OP_TYPEMINSTAR:
306     case OP_TYPEQUERY:
307     case OP_TYPEMINQUERY:
308     case OP_TYPEPOSSTAR:
309     case OP_TYPEPOSQUERY:
310     if (cc[1] == OP_PROP || cc[1] == OP_NOTPROP) cc += 2;
311     cc += PRIV(OP_lengths)[op];
312     break;
313 
314     case OP_TYPEUPTO:
315     case OP_TYPEMINUPTO:
316     case OP_TYPEPOSUPTO:
317     if (cc[1 + IMM2_SIZE] == OP_PROP
318       || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
319     cc += PRIV(OP_lengths)[op];
320     break;
321 
322     /* Check a class for variable quantification */
323 
324 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
325     case OP_XCLASS:
326     cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
327     /* Fall through */
328 #endif
329 
330     case OP_CLASS:
331     case OP_NCLASS:
332     cc += PRIV(OP_lengths)[OP_CLASS];
333 
334     switch (*cc)
335       {
336       case OP_CRPLUS:
337       case OP_CRMINPLUS:
338       branchlength++;
339       /* Fall through */
340 
341       case OP_CRSTAR:
342       case OP_CRMINSTAR:
343       case OP_CRQUERY:
344       case OP_CRMINQUERY:
345       cc++;
346       break;
347 
348       case OP_CRRANGE:
349       case OP_CRMINRANGE:
350       branchlength += GET2(cc,1);
351       cc += 1 + 2 * IMM2_SIZE;
352       break;
353 
354       default:
355       branchlength++;
356       break;
357       }
358     break;
359 
360     /* Backreferences and subroutine calls are treated in the same way: we find
361     the minimum length for the subpattern. A recursion, however, causes an
362     a flag to be set that causes the length of this branch to be ignored. The
363     logic is that a recursion can only make sense if there is another
364     alternation that stops the recursing. That will provide the minimum length
365     (when no recursion happens). A backreference within the group that it is
366     referencing behaves in the same way.
367 
368     If PCRE_JAVASCRIPT_COMPAT is set, a backreference to an unset bracket
369     matches an empty string (by default it causes a matching failure), so in
370     that case we must set the minimum length to zero. */
371 
372     case OP_REF:
373     case OP_REFI:
374     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0)
375       {
376       ce = cs = (pcre_uchar *)PRIV(find_bracket)(startcode, utf, GET2(cc, 1));
377       if (cs == NULL) return -2;
378       do ce += GET(ce, 1); while (*ce == OP_ALT);
379       if (cc > cs && cc < ce)
380         {
381         d = 0;
382         had_recurse = TRUE;
383         }
384       else
385         {
386         d = find_minlength(cs, startcode, options, recurse_depth);
387         }
388       }
389     else d = 0;
390     cc += 1 + IMM2_SIZE;
391 
392     /* Handle repeated back references */
393 
394     switch (*cc)
395       {
396       case OP_CRSTAR:
397       case OP_CRMINSTAR:
398       case OP_CRQUERY:
399       case OP_CRMINQUERY:
400       min = 0;
401       cc++;
402       break;
403 
404       case OP_CRPLUS:
405       case OP_CRMINPLUS:
406       min = 1;
407       cc++;
408       break;
409 
410       case OP_CRRANGE:
411       case OP_CRMINRANGE:
412       min = GET2(cc, 1);
413       cc += 1 + 2 * IMM2_SIZE;
414       break;
415 
416       default:
417       min = 1;
418       break;
419       }
420 
421     branchlength += min * d;
422     break;
423 
424     /* We can easily detect direct recursion, but not mutual recursion. This is
425     caught by a recursion depth count. */
426 
427     case OP_RECURSE:
428     cs = ce = (pcre_uchar *)startcode + GET(cc, 1);
429     do ce += GET(ce, 1); while (*ce == OP_ALT);
430     if ((cc > cs && cc < ce) || recurse_depth > 10)
431       had_recurse = TRUE;
432     else
433       {
434       branchlength += find_minlength(cs, startcode, options, recurse_depth + 1);
435       }
436     cc += 1 + LINK_SIZE;
437     break;
438 
439     /* Anything else does not or need not match a character. We can get the
440     item's length from the table, but for those that can match zero occurrences
441     of a character, we must take special action for UTF-8 characters. As it
442     happens, the "NOT" versions of these opcodes are used at present only for
443     ASCII characters, so they could be omitted from this list. However, in
444     future that may change, so we include them here so as not to leave a
445     gotcha for a future maintainer. */
446 
447     case OP_UPTO:
448     case OP_UPTOI:
449     case OP_NOTUPTO:
450     case OP_NOTUPTOI:
451     case OP_MINUPTO:
452     case OP_MINUPTOI:
453     case OP_NOTMINUPTO:
454     case OP_NOTMINUPTOI:
455     case OP_POSUPTO:
456     case OP_POSUPTOI:
457     case OP_NOTPOSUPTO:
458     case OP_NOTPOSUPTOI:
459 
460     case OP_STAR:
461     case OP_STARI:
462     case OP_NOTSTAR:
463     case OP_NOTSTARI:
464     case OP_MINSTAR:
465     case OP_MINSTARI:
466     case OP_NOTMINSTAR:
467     case OP_NOTMINSTARI:
468     case OP_POSSTAR:
469     case OP_POSSTARI:
470     case OP_NOTPOSSTAR:
471     case OP_NOTPOSSTARI:
472 
473     case OP_QUERY:
474     case OP_QUERYI:
475     case OP_NOTQUERY:
476     case OP_NOTQUERYI:
477     case OP_MINQUERY:
478     case OP_MINQUERYI:
479     case OP_NOTMINQUERY:
480     case OP_NOTMINQUERYI:
481     case OP_POSQUERY:
482     case OP_POSQUERYI:
483     case OP_NOTPOSQUERY:
484     case OP_NOTPOSQUERYI:
485 
486     cc += PRIV(OP_lengths)[op];
487 #ifdef SUPPORT_UTF
488     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
489 #endif
490     break;
491 
492     /* Skip these, but we need to add in the name length. */
493 
494     case OP_MARK:
495     case OP_PRUNE_ARG:
496     case OP_SKIP_ARG:
497     case OP_THEN_ARG:
498     cc += PRIV(OP_lengths)[op] + cc[1];
499     break;
500 
501     /* The remaining opcodes are just skipped over. */
502 
503     case OP_CLOSE:
504     case OP_COMMIT:
505     case OP_FAIL:
506     case OP_PRUNE:
507     case OP_SET_SOM:
508     case OP_SKIP:
509     case OP_THEN:
510     cc += PRIV(OP_lengths)[op];
511     break;
512 
513     /* This should not occur: we list all opcodes explicitly so that when
514     new ones get added they are properly considered. */
515 
516     default:
517     return -3;
518     }
519   }
520 /* Control never gets here */
521 }
522 
523 
524 
525 /*************************************************
526 *      Set a bit and maybe its alternate case    *
527 *************************************************/
528 
529 /* Given a character, set its first byte's bit in the table, and also the
530 corresponding bit for the other version of a letter if we are caseless. In
531 UTF-8 mode, for characters greater than 127, we can only do the caseless thing
532 when Unicode property support is available.
533 
534 Arguments:
535   start_bits    points to the bit map
536   p             points to the character
537   caseless      the caseless flag
538   cd            the block with char table pointers
539   utf           TRUE for UTF-8 / UTF-16 mode
540 
541 Returns:        pointer after the character
542 */
543 
544 static const pcre_uchar *
set_table_bit(pcre_uint8 * start_bits,const pcre_uchar * p,BOOL caseless,compile_data * cd,BOOL utf)545 set_table_bit(pcre_uint8 *start_bits, const pcre_uchar *p, BOOL caseless,
546   compile_data *cd, BOOL utf)
547 {
548 unsigned int c = *p;
549 
550 #ifdef COMPILE_PCRE8
551 SET_BIT(c);
552 
553 #ifdef SUPPORT_UTF
554 if (utf && c > 127)
555   {
556   GETCHARINC(c, p);
557 #ifdef SUPPORT_UCP
558   if (caseless)
559     {
560     pcre_uchar buff[6];
561     c = UCD_OTHERCASE(c);
562     (void)PRIV(ord2utf)(c, buff);
563     SET_BIT(buff[0]);
564     }
565 #endif
566   return p;
567   }
568 #endif
569 
570 /* Not UTF-8 mode, or character is less than 127. */
571 
572 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
573 return p + 1;
574 #endif
575 
576 #ifdef COMPILE_PCRE16
577 if (c > 0xff)
578   {
579   c = 0xff;
580   caseless = FALSE;
581   }
582 SET_BIT(c);
583 
584 #ifdef SUPPORT_UTF
585 if (utf && c > 127)
586   {
587   GETCHARINC(c, p);
588 #ifdef SUPPORT_UCP
589   if (caseless)
590     {
591     c = UCD_OTHERCASE(c);
592     if (c > 0xff)
593       c = 0xff;
594     SET_BIT(c);
595     }
596 #endif
597   return p;
598   }
599 #endif
600 
601 if (caseless && (cd->ctypes[c] & ctype_letter) != 0) SET_BIT(cd->fcc[c]);
602 return p + 1;
603 #endif
604 }
605 
606 
607 
608 /*************************************************
609 *     Set bits for a positive character type     *
610 *************************************************/
611 
612 /* This function sets starting bits for a character type. In UTF-8 mode, we can
613 only do a direct setting for bytes less than 128, as otherwise there can be
614 confusion with bytes in the middle of UTF-8 characters. In a "traditional"
615 environment, the tables will only recognize ASCII characters anyway, but in at
616 least one Windows environment, some higher bytes bits were set in the tables.
617 So we deal with that case by considering the UTF-8 encoding.
618 
619 Arguments:
620   start_bits     the starting bitmap
621   cbit type      the type of character wanted
622   table_limit    32 for non-UTF-8; 16 for UTF-8
623   cd             the block with char table pointers
624 
625 Returns:         nothing
626 */
627 
628 static void
set_type_bits(pcre_uint8 * start_bits,int cbit_type,int table_limit,compile_data * cd)629 set_type_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
630   compile_data *cd)
631 {
632 int c;
633 for (c = 0; c < table_limit; c++) start_bits[c] |= cd->cbits[c+cbit_type];
634 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
635 if (table_limit == 32) return;
636 for (c = 128; c < 256; c++)
637   {
638   if ((cd->cbits[c/8] & (1 << (c&7))) != 0)
639     {
640     pcre_uchar buff[6];
641     (void)PRIV(ord2utf)(c, buff);
642     SET_BIT(buff[0]);
643     }
644   }
645 #endif
646 }
647 
648 
649 /*************************************************
650 *     Set bits for a negative character type     *
651 *************************************************/
652 
653 /* This function sets starting bits for a negative character type such as \D.
654 In UTF-8 mode, we can only do a direct setting for bytes less than 128, as
655 otherwise there can be confusion with bytes in the middle of UTF-8 characters.
656 Unlike in the positive case, where we can set appropriate starting bits for
657 specific high-valued UTF-8 characters, in this case we have to set the bits for
658 all high-valued characters. The lowest is 0xc2, but we overkill by starting at
659 0xc0 (192) for simplicity.
660 
661 Arguments:
662   start_bits     the starting bitmap
663   cbit type      the type of character wanted
664   table_limit    32 for non-UTF-8; 16 for UTF-8
665   cd             the block with char table pointers
666 
667 Returns:         nothing
668 */
669 
670 static void
set_nottype_bits(pcre_uint8 * start_bits,int cbit_type,int table_limit,compile_data * cd)671 set_nottype_bits(pcre_uint8 *start_bits, int cbit_type, int table_limit,
672   compile_data *cd)
673 {
674 int c;
675 for (c = 0; c < table_limit; c++) start_bits[c] |= ~cd->cbits[c+cbit_type];
676 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
677 if (table_limit != 32) for (c = 24; c < 32; c++) start_bits[c] = 0xff;
678 #endif
679 }
680 
681 
682 
683 /*************************************************
684 *          Create bitmap of starting bytes       *
685 *************************************************/
686 
687 /* This function scans a compiled unanchored expression recursively and
688 attempts to build a bitmap of the set of possible starting bytes. As time goes
689 by, we may be able to get more clever at doing this. The SSB_CONTINUE return is
690 useful for parenthesized groups in patterns such as (a*)b where the group
691 provides some optional starting bytes but scanning must continue at the outer
692 level to find at least one mandatory byte. At the outermost level, this
693 function fails unless the result is SSB_DONE.
694 
695 Arguments:
696   code         points to an expression
697   start_bits   points to a 32-byte table, initialized to 0
698   utf          TRUE if in UTF-8 / UTF-16 mode
699   cd           the block with char table pointers
700 
701 Returns:       SSB_FAIL     => Failed to find any starting bytes
702                SSB_DONE     => Found mandatory starting bytes
703                SSB_CONTINUE => Found optional starting bytes
704                SSB_UNKNOWN  => Hit an unrecognized opcode
705 */
706 
707 static int
set_start_bits(const pcre_uchar * code,pcre_uint8 * start_bits,BOOL utf,compile_data * cd)708 set_start_bits(const pcre_uchar *code, pcre_uint8 *start_bits, BOOL utf,
709   compile_data *cd)
710 {
711 int c;
712 int yield = SSB_DONE;
713 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
714 int table_limit = utf? 16:32;
715 #else
716 int table_limit = 32;
717 #endif
718 
719 #if 0
720 /* ========================================================================= */
721 /* The following comment and code was inserted in January 1999. In May 2006,
722 when it was observed to cause compiler warnings about unused values, I took it
723 out again. If anybody is still using OS/2, they will have to put it back
724 manually. */
725 
726 /* This next statement and the later reference to dummy are here in order to
727 trick the optimizer of the IBM C compiler for OS/2 into generating correct
728 code. Apparently IBM isn't going to fix the problem, and we would rather not
729 disable optimization (in this module it actually makes a big difference, and
730 the pcre module can use all the optimization it can get). */
731 
732 volatile int dummy;
733 /* ========================================================================= */
734 #endif
735 
736 do
737   {
738   BOOL try_next = TRUE;
739   const pcre_uchar *tcode = code + 1 + LINK_SIZE;
740 
741   if (*code == OP_CBRA || *code == OP_SCBRA ||
742       *code == OP_CBRAPOS || *code == OP_SCBRAPOS) tcode += IMM2_SIZE;
743 
744   while (try_next)    /* Loop for items in this branch */
745     {
746     int rc;
747 
748     switch(*tcode)
749       {
750       /* If we reach something we don't understand, it means a new opcode has
751       been created that hasn't been added to this code. Hopefully this problem
752       will be discovered during testing. */
753 
754       default:
755       return SSB_UNKNOWN;
756 
757       /* Fail for a valid opcode that implies no starting bits. */
758 
759       case OP_ACCEPT:
760       case OP_ASSERT_ACCEPT:
761       case OP_ALLANY:
762       case OP_ANY:
763       case OP_ANYBYTE:
764       case OP_CIRC:
765       case OP_CIRCM:
766       case OP_CLOSE:
767       case OP_COMMIT:
768       case OP_COND:
769       case OP_CREF:
770       case OP_DEF:
771       case OP_DOLL:
772       case OP_DOLLM:
773       case OP_END:
774       case OP_EOD:
775       case OP_EODN:
776       case OP_EXTUNI:
777       case OP_FAIL:
778       case OP_MARK:
779       case OP_NCREF:
780       case OP_NOT:
781       case OP_NOTEXACT:
782       case OP_NOTEXACTI:
783       case OP_NOTI:
784       case OP_NOTMINPLUS:
785       case OP_NOTMINPLUSI:
786       case OP_NOTMINQUERY:
787       case OP_NOTMINQUERYI:
788       case OP_NOTMINSTAR:
789       case OP_NOTMINSTARI:
790       case OP_NOTMINUPTO:
791       case OP_NOTMINUPTOI:
792       case OP_NOTPLUS:
793       case OP_NOTPLUSI:
794       case OP_NOTPOSPLUS:
795       case OP_NOTPOSPLUSI:
796       case OP_NOTPOSQUERY:
797       case OP_NOTPOSQUERYI:
798       case OP_NOTPOSSTAR:
799       case OP_NOTPOSSTARI:
800       case OP_NOTPOSUPTO:
801       case OP_NOTPOSUPTOI:
802       case OP_NOTPROP:
803       case OP_NOTQUERY:
804       case OP_NOTQUERYI:
805       case OP_NOTSTAR:
806       case OP_NOTSTARI:
807       case OP_NOTUPTO:
808       case OP_NOTUPTOI:
809       case OP_NOT_HSPACE:
810       case OP_NOT_VSPACE:
811       case OP_NRREF:
812       case OP_PROP:
813       case OP_PRUNE:
814       case OP_PRUNE_ARG:
815       case OP_RECURSE:
816       case OP_REF:
817       case OP_REFI:
818       case OP_REVERSE:
819       case OP_RREF:
820       case OP_SCOND:
821       case OP_SET_SOM:
822       case OP_SKIP:
823       case OP_SKIP_ARG:
824       case OP_SOD:
825       case OP_SOM:
826       case OP_THEN:
827       case OP_THEN_ARG:
828 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
829       case OP_XCLASS:
830 #endif
831       return SSB_FAIL;
832 
833       /* We can ignore word boundary tests. */
834 
835       case OP_WORD_BOUNDARY:
836       case OP_NOT_WORD_BOUNDARY:
837       tcode++;
838       break;
839 
840       /* If we hit a bracket or a positive lookahead assertion, recurse to set
841       bits from within the subpattern. If it can't find anything, we have to
842       give up. If it finds some mandatory character(s), we are done for this
843       branch. Otherwise, carry on scanning after the subpattern. */
844 
845       case OP_BRA:
846       case OP_SBRA:
847       case OP_CBRA:
848       case OP_SCBRA:
849       case OP_BRAPOS:
850       case OP_SBRAPOS:
851       case OP_CBRAPOS:
852       case OP_SCBRAPOS:
853       case OP_ONCE:
854       case OP_ONCE_NC:
855       case OP_ASSERT:
856       rc = set_start_bits(tcode, start_bits, utf, cd);
857       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
858       if (rc == SSB_DONE) try_next = FALSE; else
859         {
860         do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
861         tcode += 1 + LINK_SIZE;
862         }
863       break;
864 
865       /* If we hit ALT or KET, it means we haven't found anything mandatory in
866       this branch, though we might have found something optional. For ALT, we
867       continue with the next alternative, but we have to arrange that the final
868       result from subpattern is SSB_CONTINUE rather than SSB_DONE. For KET,
869       return SSB_CONTINUE: if this is the top level, that indicates failure,
870       but after a nested subpattern, it causes scanning to continue. */
871 
872       case OP_ALT:
873       yield = SSB_CONTINUE;
874       try_next = FALSE;
875       break;
876 
877       case OP_KET:
878       case OP_KETRMAX:
879       case OP_KETRMIN:
880       case OP_KETRPOS:
881       return SSB_CONTINUE;
882 
883       /* Skip over callout */
884 
885       case OP_CALLOUT:
886       tcode += 2 + 2*LINK_SIZE;
887       break;
888 
889       /* Skip over lookbehind and negative lookahead assertions */
890 
891       case OP_ASSERT_NOT:
892       case OP_ASSERTBACK:
893       case OP_ASSERTBACK_NOT:
894       do tcode += GET(tcode, 1); while (*tcode == OP_ALT);
895       tcode += 1 + LINK_SIZE;
896       break;
897 
898       /* BRAZERO does the bracket, but carries on. */
899 
900       case OP_BRAZERO:
901       case OP_BRAMINZERO:
902       case OP_BRAPOSZERO:
903       rc = set_start_bits(++tcode, start_bits, utf, cd);
904       if (rc == SSB_FAIL || rc == SSB_UNKNOWN) return rc;
905 /* =========================================================================
906       See the comment at the head of this function concerning the next line,
907       which was an old fudge for the benefit of OS/2.
908       dummy = 1;
909   ========================================================================= */
910       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
911       tcode += 1 + LINK_SIZE;
912       break;
913 
914       /* SKIPZERO skips the bracket. */
915 
916       case OP_SKIPZERO:
917       tcode++;
918       do tcode += GET(tcode,1); while (*tcode == OP_ALT);
919       tcode += 1 + LINK_SIZE;
920       break;
921 
922       /* Single-char * or ? sets the bit and tries the next item */
923 
924       case OP_STAR:
925       case OP_MINSTAR:
926       case OP_POSSTAR:
927       case OP_QUERY:
928       case OP_MINQUERY:
929       case OP_POSQUERY:
930       tcode = set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
931       break;
932 
933       case OP_STARI:
934       case OP_MINSTARI:
935       case OP_POSSTARI:
936       case OP_QUERYI:
937       case OP_MINQUERYI:
938       case OP_POSQUERYI:
939       tcode = set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
940       break;
941 
942       /* Single-char upto sets the bit and tries the next */
943 
944       case OP_UPTO:
945       case OP_MINUPTO:
946       case OP_POSUPTO:
947       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, FALSE, cd, utf);
948       break;
949 
950       case OP_UPTOI:
951       case OP_MINUPTOI:
952       case OP_POSUPTOI:
953       tcode = set_table_bit(start_bits, tcode + 1 + IMM2_SIZE, TRUE, cd, utf);
954       break;
955 
956       /* At least one single char sets the bit and stops */
957 
958       case OP_EXACT:
959       tcode += IMM2_SIZE;
960       /* Fall through */
961       case OP_CHAR:
962       case OP_PLUS:
963       case OP_MINPLUS:
964       case OP_POSPLUS:
965       (void)set_table_bit(start_bits, tcode + 1, FALSE, cd, utf);
966       try_next = FALSE;
967       break;
968 
969       case OP_EXACTI:
970       tcode += IMM2_SIZE;
971       /* Fall through */
972       case OP_CHARI:
973       case OP_PLUSI:
974       case OP_MINPLUSI:
975       case OP_POSPLUSI:
976       (void)set_table_bit(start_bits, tcode + 1, TRUE, cd, utf);
977       try_next = FALSE;
978       break;
979 
980       /* Special spacing and line-terminating items. These recognize specific
981       lists of characters. The difference between VSPACE and ANYNL is that the
982       latter can match the two-character CRLF sequence, but that is not
983       relevant for finding the first character, so their code here is
984       identical. */
985 
986       case OP_HSPACE:
987       SET_BIT(0x09);
988       SET_BIT(0x20);
989 #ifdef SUPPORT_UTF
990       if (utf)
991         {
992 #ifdef COMPILE_PCRE8
993         SET_BIT(0xC2);  /* For U+00A0 */
994         SET_BIT(0xE1);  /* For U+1680, U+180E */
995         SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
996         SET_BIT(0xE3);  /* For U+3000 */
997 #endif
998 #ifdef COMPILE_PCRE16
999         SET_BIT(0xA0);
1000         SET_BIT(0xFF);  /* For characters > 255 */
1001 #endif
1002         }
1003       else
1004 #endif /* SUPPORT_UTF */
1005         {
1006         SET_BIT(0xA0);
1007 #ifdef COMPILE_PCRE16
1008         SET_BIT(0xFF);  /* For characters > 255 */
1009 #endif
1010         }
1011       try_next = FALSE;
1012       break;
1013 
1014       case OP_ANYNL:
1015       case OP_VSPACE:
1016       SET_BIT(0x0A);
1017       SET_BIT(0x0B);
1018       SET_BIT(0x0C);
1019       SET_BIT(0x0D);
1020 #ifdef SUPPORT_UTF
1021       if (utf)
1022         {
1023 #ifdef COMPILE_PCRE8
1024         SET_BIT(0xC2);  /* For U+0085 */
1025         SET_BIT(0xE2);  /* For U+2028, U+2029 */
1026 #endif
1027 #ifdef COMPILE_PCRE16
1028         SET_BIT(0x85);
1029         SET_BIT(0xFF);  /* For characters > 255 */
1030 #endif
1031         }
1032       else
1033 #endif /* SUPPORT_UTF */
1034         {
1035         SET_BIT(0x85);
1036 #ifdef COMPILE_PCRE16
1037         SET_BIT(0xFF);  /* For characters > 255 */
1038 #endif
1039         }
1040       try_next = FALSE;
1041       break;
1042 
1043       /* Single character types set the bits and stop. Note that if PCRE_UCP
1044       is set, we do not see these op codes because \d etc are converted to
1045       properties. Therefore, these apply in the case when only characters less
1046       than 256 are recognized to match the types. */
1047 
1048       case OP_NOT_DIGIT:
1049       set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1050       try_next = FALSE;
1051       break;
1052 
1053       case OP_DIGIT:
1054       set_type_bits(start_bits, cbit_digit, table_limit, cd);
1055       try_next = FALSE;
1056       break;
1057 
1058       /* The cbit_space table has vertical tab as whitespace; we have to
1059       ensure it is set as not whitespace. */
1060 
1061       case OP_NOT_WHITESPACE:
1062       set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1063       start_bits[1] |= 0x08;
1064       try_next = FALSE;
1065       break;
1066 
1067       /* The cbit_space table has vertical tab as whitespace; we have to
1068       not set it from the table. */
1069 
1070       case OP_WHITESPACE:
1071       c = start_bits[1];    /* Save in case it was already set */
1072       set_type_bits(start_bits, cbit_space, table_limit, cd);
1073       start_bits[1] = (start_bits[1] & ~0x08) | c;
1074       try_next = FALSE;
1075       break;
1076 
1077       case OP_NOT_WORDCHAR:
1078       set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1079       try_next = FALSE;
1080       break;
1081 
1082       case OP_WORDCHAR:
1083       set_type_bits(start_bits, cbit_word, table_limit, cd);
1084       try_next = FALSE;
1085       break;
1086 
1087       /* One or more character type fudges the pointer and restarts, knowing
1088       it will hit a single character type and stop there. */
1089 
1090       case OP_TYPEPLUS:
1091       case OP_TYPEMINPLUS:
1092       case OP_TYPEPOSPLUS:
1093       tcode++;
1094       break;
1095 
1096       case OP_TYPEEXACT:
1097       tcode += 1 + IMM2_SIZE;
1098       break;
1099 
1100       /* Zero or more repeats of character types set the bits and then
1101       try again. */
1102 
1103       case OP_TYPEUPTO:
1104       case OP_TYPEMINUPTO:
1105       case OP_TYPEPOSUPTO:
1106       tcode += IMM2_SIZE;  /* Fall through */
1107 
1108       case OP_TYPESTAR:
1109       case OP_TYPEMINSTAR:
1110       case OP_TYPEPOSSTAR:
1111       case OP_TYPEQUERY:
1112       case OP_TYPEMINQUERY:
1113       case OP_TYPEPOSQUERY:
1114       switch(tcode[1])
1115         {
1116         default:
1117         case OP_ANY:
1118         case OP_ALLANY:
1119         return SSB_FAIL;
1120 
1121         case OP_HSPACE:
1122         SET_BIT(0x09);
1123         SET_BIT(0x20);
1124 #ifdef SUPPORT_UTF
1125         if (utf)
1126           {
1127 #ifdef COMPILE_PCRE8
1128           SET_BIT(0xC2);  /* For U+00A0 */
1129           SET_BIT(0xE1);  /* For U+1680, U+180E */
1130           SET_BIT(0xE2);  /* For U+2000 - U+200A, U+202F, U+205F */
1131           SET_BIT(0xE3);  /* For U+3000 */
1132 #endif
1133 #ifdef COMPILE_PCRE16
1134           SET_BIT(0xA0);
1135           SET_BIT(0xFF);  /* For characters > 255 */
1136 #endif
1137           }
1138         else
1139 #endif /* SUPPORT_UTF */
1140           SET_BIT(0xA0);
1141         break;
1142 
1143         case OP_ANYNL:
1144         case OP_VSPACE:
1145         SET_BIT(0x0A);
1146         SET_BIT(0x0B);
1147         SET_BIT(0x0C);
1148         SET_BIT(0x0D);
1149 #ifdef SUPPORT_UTF
1150         if (utf)
1151           {
1152 #ifdef COMPILE_PCRE8
1153           SET_BIT(0xC2);  /* For U+0085 */
1154           SET_BIT(0xE2);  /* For U+2028, U+2029 */
1155 #endif
1156 #ifdef COMPILE_PCRE16
1157           SET_BIT(0x85);
1158           SET_BIT(0xFF);  /* For characters > 255 */
1159 #endif
1160           }
1161         else
1162 #endif /* SUPPORT_UTF */
1163           SET_BIT(0x85);
1164         break;
1165 
1166         case OP_NOT_DIGIT:
1167         set_nottype_bits(start_bits, cbit_digit, table_limit, cd);
1168         break;
1169 
1170         case OP_DIGIT:
1171         set_type_bits(start_bits, cbit_digit, table_limit, cd);
1172         break;
1173 
1174         /* The cbit_space table has vertical tab as whitespace; we have to
1175         ensure it gets set as not whitespace. */
1176 
1177         case OP_NOT_WHITESPACE:
1178         set_nottype_bits(start_bits, cbit_space, table_limit, cd);
1179         start_bits[1] |= 0x08;
1180         break;
1181 
1182         /* The cbit_space table has vertical tab as whitespace; we have to
1183         avoid setting it. */
1184 
1185         case OP_WHITESPACE:
1186         c = start_bits[1];    /* Save in case it was already set */
1187         set_type_bits(start_bits, cbit_space, table_limit, cd);
1188         start_bits[1] = (start_bits[1] & ~0x08) | c;
1189         break;
1190 
1191         case OP_NOT_WORDCHAR:
1192         set_nottype_bits(start_bits, cbit_word, table_limit, cd);
1193         break;
1194 
1195         case OP_WORDCHAR:
1196         set_type_bits(start_bits, cbit_word, table_limit, cd);
1197         break;
1198         }
1199 
1200       tcode += 2;
1201       break;
1202 
1203       /* Character class where all the information is in a bit map: set the
1204       bits and either carry on or not, according to the repeat count. If it was
1205       a negative class, and we are operating with UTF-8 characters, any byte
1206       with a value >= 0xc4 is a potentially valid starter because it starts a
1207       character with a value > 255. */
1208 
1209       case OP_NCLASS:
1210 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1211       if (utf)
1212         {
1213         start_bits[24] |= 0xf0;              /* Bits for 0xc4 - 0xc8 */
1214         memset(start_bits+25, 0xff, 7);      /* Bits for 0xc9 - 0xff */
1215         }
1216 #endif
1217 #ifdef COMPILE_PCRE16
1218       SET_BIT(0xFF);                         /* For characters > 255 */
1219 #endif
1220       /* Fall through */
1221 
1222       case OP_CLASS:
1223         {
1224         pcre_uint8 *map;
1225         tcode++;
1226         map = (pcre_uint8 *)tcode;
1227 
1228         /* In UTF-8 mode, the bits in a bit map correspond to character
1229         values, not to byte values. However, the bit map we are constructing is
1230         for byte values. So we have to do a conversion for characters whose
1231         value is > 127. In fact, there are only two possible starting bytes for
1232         characters in the range 128 - 255. */
1233 
1234 #if defined SUPPORT_UTF && defined COMPILE_PCRE8
1235         if (utf)
1236           {
1237           for (c = 0; c < 16; c++) start_bits[c] |= map[c];
1238           for (c = 128; c < 256; c++)
1239             {
1240             if ((map[c/8] && (1 << (c&7))) != 0)
1241               {
1242               int d = (c >> 6) | 0xc0;            /* Set bit for this starter */
1243               start_bits[d/8] |= (1 << (d&7));    /* and then skip on to the */
1244               c = (c & 0xc0) + 0x40 - 1;          /* next relevant character. */
1245               }
1246             }
1247           }
1248         else
1249 #endif
1250           {
1251           /* In non-UTF-8 mode, the two bit maps are completely compatible. */
1252           for (c = 0; c < 32; c++) start_bits[c] |= map[c];
1253           }
1254 
1255         /* Advance past the bit map, and act on what follows. For a zero
1256         minimum repeat, continue; otherwise stop processing. */
1257 
1258         tcode += 32 / sizeof(pcre_uchar);
1259         switch (*tcode)
1260           {
1261           case OP_CRSTAR:
1262           case OP_CRMINSTAR:
1263           case OP_CRQUERY:
1264           case OP_CRMINQUERY:
1265           tcode++;
1266           break;
1267 
1268           case OP_CRRANGE:
1269           case OP_CRMINRANGE:
1270           if (GET2(tcode, 1) == 0) tcode += 1 + 2 * IMM2_SIZE;
1271             else try_next = FALSE;
1272           break;
1273 
1274           default:
1275           try_next = FALSE;
1276           break;
1277           }
1278         }
1279       break; /* End of bitmap class handling */
1280 
1281       }      /* End of switch */
1282     }        /* End of try_next loop */
1283 
1284   code += GET(code, 1);   /* Advance to next branch */
1285   }
1286 while (*code == OP_ALT);
1287 return yield;
1288 }
1289 
1290 
1291 
1292 
1293 
1294 /*************************************************
1295 *          Study a compiled expression           *
1296 *************************************************/
1297 
1298 /* This function is handed a compiled expression that it must study to produce
1299 information that will speed up the matching. It returns a pcre[16]_extra block
1300 which then gets handed back to pcre_exec().
1301 
1302 Arguments:
1303   re        points to the compiled expression
1304   options   contains option bits
1305   errorptr  points to where to place error messages;
1306             set NULL unless error
1307 
1308 Returns:    pointer to a pcre[16]_extra block, with study_data filled in and
1309               the appropriate flags set;
1310             NULL on error or if no optimization possible
1311 */
1312 
1313 #ifdef COMPILE_PCRE8
1314 PCRE_EXP_DEFN pcre_extra * PCRE_CALL_CONVENTION
pcre_study(const pcre * external_re,int options,const char ** errorptr)1315 pcre_study(const pcre *external_re, int options, const char **errorptr)
1316 #else
1317 PCRE_EXP_DEFN pcre16_extra * PCRE_CALL_CONVENTION
1318 pcre16_study(const pcre16 *external_re, int options, const char **errorptr)
1319 #endif
1320 {
1321 int min;
1322 BOOL bits_set = FALSE;
1323 pcre_uint8 start_bits[32];
1324 PUBL(extra) *extra = NULL;
1325 pcre_study_data *study;
1326 const pcre_uint8 *tables;
1327 pcre_uchar *code;
1328 compile_data compile_block;
1329 const REAL_PCRE *re = (const REAL_PCRE *)external_re;
1330 
1331 *errorptr = NULL;
1332 
1333 if (re == NULL || re->magic_number != MAGIC_NUMBER)
1334   {
1335   *errorptr = "argument is not a compiled regular expression";
1336   return NULL;
1337   }
1338 
1339 if ((re->flags & PCRE_MODE) == 0)
1340   {
1341 #ifdef COMPILE_PCRE8
1342   *errorptr = "argument is compiled in 16 bit mode";
1343 #else
1344   *errorptr = "argument is compiled in 8 bit mode";
1345 #endif
1346   return NULL;
1347   }
1348 
1349 if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
1350   {
1351   *errorptr = "unknown or incorrect option bit(s) set";
1352   return NULL;
1353   }
1354 
1355 code = (pcre_uchar *)re + re->name_table_offset +
1356   (re->name_count * re->name_entry_size);
1357 
1358 /* For an anchored pattern, or an unanchored pattern that has a first char, or
1359 a multiline pattern that matches only at "line starts", there is no point in
1360 seeking a list of starting bytes. */
1361 
1362 if ((re->options & PCRE_ANCHORED) == 0 &&
1363     (re->flags & (PCRE_FIRSTSET|PCRE_STARTLINE)) == 0)
1364   {
1365   int rc;
1366 
1367   /* Set the character tables in the block that is passed around */
1368 
1369   tables = re->tables;
1370 
1371 #ifdef COMPILE_PCRE8
1372   if (tables == NULL)
1373     (void)pcre_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1374     (void *)(&tables));
1375 #else
1376   if (tables == NULL)
1377     (void)pcre16_fullinfo(external_re, NULL, PCRE_INFO_DEFAULT_TABLES,
1378     (void *)(&tables));
1379 #endif
1380 
1381   compile_block.lcc = tables + lcc_offset;
1382   compile_block.fcc = tables + fcc_offset;
1383   compile_block.cbits = tables + cbits_offset;
1384   compile_block.ctypes = tables + ctypes_offset;
1385 
1386   /* See if we can find a fixed set of initial characters for the pattern. */
1387 
1388   memset(start_bits, 0, 32 * sizeof(pcre_uint8));
1389   rc = set_start_bits(code, start_bits, (re->options & PCRE_UTF8) != 0,
1390     &compile_block);
1391   bits_set = rc == SSB_DONE;
1392   if (rc == SSB_UNKNOWN)
1393     {
1394     *errorptr = "internal error: opcode not recognized";
1395     return NULL;
1396     }
1397   }
1398 
1399 /* Find the minimum length of subject string. */
1400 
1401 switch(min = find_minlength(code, code, re->options, 0))
1402   {
1403   case -2: *errorptr = "internal error: missing capturing bracket"; return NULL;
1404   case -3: *errorptr = "internal error: opcode not recognized"; return NULL;
1405   default: break;
1406   }
1407 
1408 /* If a set of starting bytes has been identified, or if the minimum length is
1409 greater than zero, or if JIT optimization has been requested, get a
1410 pcre[16]_extra block and a pcre_study_data block. The study data is put in the
1411 latter, which is pointed to by the former, which may also get additional data
1412 set later by the calling program. At the moment, the size of pcre_study_data
1413 is fixed. We nevertheless save it in a field for returning via the
1414 pcre_fullinfo() function so that if it becomes variable in the future,
1415 we don't have to change that code. */
1416 
1417 if (bits_set || min > 0
1418 #ifdef SUPPORT_JIT
1419     || (options & (PCRE_STUDY_JIT_COMPILE | PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE
1420                  | PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE)) != 0
1421 #endif
1422   )
1423   {
1424   extra = (PUBL(extra) *)(PUBL(malloc))
1425     (sizeof(PUBL(extra)) + sizeof(pcre_study_data));
1426   if (extra == NULL)
1427     {
1428     *errorptr = "failed to get memory";
1429     return NULL;
1430     }
1431 
1432   study = (pcre_study_data *)((char *)extra + sizeof(PUBL(extra)));
1433   extra->flags = PCRE_EXTRA_STUDY_DATA;
1434   extra->study_data = study;
1435 
1436   study->size = sizeof(pcre_study_data);
1437   study->flags = 0;
1438 
1439   /* Set the start bits always, to avoid unset memory errors if the
1440   study data is written to a file, but set the flag only if any of the bits
1441   are set, to save time looking when none are. */
1442 
1443   if (bits_set)
1444     {
1445     study->flags |= PCRE_STUDY_MAPPED;
1446     memcpy(study->start_bits, start_bits, sizeof(start_bits));
1447     }
1448   else memset(study->start_bits, 0, 32 * sizeof(pcre_uint8));
1449 
1450 #ifdef PCRE_DEBUG
1451   if (bits_set)
1452     {
1453     pcre_uint8 *ptr = start_bits;
1454     int i;
1455 
1456     printf("Start bits:\n");
1457     for (i = 0; i < 32; i++)
1458       printf("%3d: %02x%s", i * 8, *ptr++, ((i + 1) & 0x7) != 0? " " : "\n");
1459     }
1460 #endif
1461 
1462   /* Always set the minlength value in the block, because the JIT compiler
1463   makes use of it. However, don't set the bit unless the length is greater than
1464   zero - the interpretive pcre_exec() and pcre_dfa_exec() needn't waste time
1465   checking the zero case. */
1466 
1467   if (min > 0)
1468     {
1469     study->flags |= PCRE_STUDY_MINLEN;
1470     study->minlength = min;
1471     }
1472   else study->minlength = 0;
1473 
1474   /* If JIT support was compiled and requested, attempt the JIT compilation.
1475   If no starting bytes were found, and the minimum length is zero, and JIT
1476   compilation fails, abandon the extra block and return NULL. */
1477 
1478 #ifdef SUPPORT_JIT
1479   extra->executable_jit = NULL;
1480   if ((options & PCRE_STUDY_JIT_COMPILE) != 0)
1481     PRIV(jit_compile)(re, extra, JIT_COMPILE);
1482   if ((options & PCRE_STUDY_JIT_PARTIAL_SOFT_COMPILE) != 0)
1483     PRIV(jit_compile)(re, extra, JIT_PARTIAL_SOFT_COMPILE);
1484   if ((options & PCRE_STUDY_JIT_PARTIAL_HARD_COMPILE) != 0)
1485     PRIV(jit_compile)(re, extra, JIT_PARTIAL_HARD_COMPILE);
1486 
1487   if (study->flags == 0 && (extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) == 0)
1488     {
1489 #ifdef COMPILE_PCRE8
1490     pcre_free_study(extra);
1491 #endif
1492 #ifdef COMPILE_PCRE16
1493     pcre16_free_study(extra);
1494 #endif
1495     extra = NULL;
1496     }
1497 #endif
1498   }
1499 
1500 return extra;
1501 }
1502 
1503 
1504 /*************************************************
1505 *          Free the study data                   *
1506 *************************************************/
1507 
1508 /* This function frees the memory that was obtained by pcre_study().
1509 
1510 Argument:   a pointer to the pcre[16]_extra block
1511 Returns:    nothing
1512 */
1513 
1514 #ifdef COMPILE_PCRE8
1515 PCRE_EXP_DEFN void
pcre_free_study(pcre_extra * extra)1516 pcre_free_study(pcre_extra *extra)
1517 #else
1518 PCRE_EXP_DEFN void
1519 pcre16_free_study(pcre16_extra *extra)
1520 #endif
1521 {
1522 if (extra == NULL)
1523   return;
1524 #ifdef SUPPORT_JIT
1525 if ((extra->flags & PCRE_EXTRA_EXECUTABLE_JIT) != 0 &&
1526      extra->executable_jit != NULL)
1527   PRIV(jit_free)(extra->executable_jit);
1528 #endif
1529 PUBL(free)(extra);
1530 }
1531 
/* End of pcre_study.c */