• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*************************************************
2 *      Perl-Compatible Regular Expressions       *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8                        Written by Philip Hazel
9            Copyright (c) 1997-2012 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15     * Redistributions of source code must retain the above copyright notice,
16       this list of conditions and the following disclaimer.
17 
18     * Redistributions in binary form must reproduce the above copyright
19       notice, this list of conditions and the following disclaimer in the
20       documentation and/or other materials provided with the distribution.
21 
22     * Neither the name of the University of Cambridge nor the names of its
23       contributors may be used to endorse or promote products derived from
24       this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains the external function pcre_compile(), along with
42 supporting internal functions that are not used by other modules. */
43 
44 
45 #include "config.h"
46 
47 #define NLBLOCK cd             /* Block containing newline information */
48 #define PSSTART start_pattern  /* Field containing processed string start */
49 #define PSEND   end_pattern    /* Field containing processed string end */
50 
51 #include "pcre_internal.h"
52 
53 #ifdef GLIB_COMPILATION
54 #include "gstrfuncs.h"
55 #else
56 #include <glib.h>
57 #endif
58 
59 /* When PCRE_DEBUG is defined, we need the pcre(16)_printint() function, which
60 is also used by pcretest. PCRE_DEBUG is not defined when building a production
61 library. We do not need to select pcre16_printint.c specially, because the
62 COMPILE_PCREx macro will already be appropriately set. */
63 
64 #ifdef PCRE_DEBUG
65 /* pcre_printint.c should not include any headers */
66 #define PCRE_INCLUDED
67 #include "pcre_printint.c"
68 #undef PCRE_INCLUDED
69 #endif
70 
71 
/* Macro for setting individual bits in class bitmaps: sets bit number b in
the byte-array bitmap a. Both arguments and the whole expansion are
parenthesized so that an expression argument (for example SETBIT(map, c + 8))
indexes and shifts by the intended value, and so that the macro behaves as a
single expression wherever it is used. Note that b is evaluated twice, so it
must not have side effects. */

#define SETBIT(a,b) ((a)[(b)/8] |= (1 << ((b)%8)))
75 
76 /* Maximum length value to check against when making sure that the integer that
77 holds the compiled pattern length does not overflow. We make it a bit less than
78 INT_MAX to allow for adding in group terminating bytes, so that we don't have
79 to check them every time. */
80 
81 #define OFLOW_MAX (INT_MAX - 20)
82 
83 
84 /*************************************************
85 *      Code parameters and static tables         *
86 *************************************************/
87 
88 /* This value specifies the size of stack workspace that is used during the
89 first pre-compile phase that determines how much memory is required. The regex
90 is partly compiled into this space, but the compiled parts are discarded as
91 soon as they can be, so that hopefully there will never be an overrun. The code
92 does, however, check for an overrun. The largest amount I've seen used is 218,
93 so this number is very generous.
94 
95 The same workspace is used during the second, actual compile phase for
96 remembering forward references to groups so that they can be filled in at the
97 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
98 is 4 there is plenty of room for most patterns. However, the memory can get
99 filled up by repetitions of forward references, for example patterns like
100 /(?1){0,1999}(b)/, and one user did hit the limit. The code has been changed so
101 that the workspace is expanded using malloc() in this situation. The value
102 below is therefore a minimum, and we put a maximum on it for safety. The
103 minimum is now also defined in terms of LINK_SIZE so that the use of malloc()
104 kicks in at the same number of forward references in all cases. */
105 
106 #define COMPILE_WORK_SIZE (2048*LINK_SIZE)
107 #define COMPILE_WORK_SIZE_MAX (100*COMPILE_WORK_SIZE)
108 
109 /* The overrun tests check for a slightly smaller size so that they detect the
110 overrun before it actually does run off the end of the data block. */
111 
112 #define WORK_SIZE_SAFETY_MARGIN (100)
113 
114 /* Private flags added to firstchar and reqchar. */
115 
116 #define REQ_CASELESS   0x10000000l      /* Indicates caselessness */
117 #define REQ_VARY       0x20000000l      /* Reqchar followed non-literal item */
118 
119 /* Repeated character flags. */
120 
121 #define UTF_LENGTH     0x10000000l      /* The char contains its length. */
122 
123 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
124 are simple data values; negative values are for special things like \d and so
125 on. Zero means further processing is needed (for things like \x), or the escape
126 is invalid. */
127 
128 #ifndef EBCDIC
129 
130 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
131 in UTF-8 mode. */
132 
133 static const short int escapes[] = {
134      0,                       0,
135      0,                       0,
136      0,                       0,
137      0,                       0,
138      0,                       0,
139      CHAR_COLON,              CHAR_SEMICOLON,
140      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
141      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
142      CHAR_COMMERCIAL_AT,      -ESC_A,
143      -ESC_B,                  -ESC_C,
144      -ESC_D,                  -ESC_E,
145      0,                       -ESC_G,
146      -ESC_H,                  0,
147      0,                       -ESC_K,
148      0,                       0,
149      -ESC_N,                  0,
150      -ESC_P,                  -ESC_Q,
151      -ESC_R,                  -ESC_S,
152      0,                       0,
153      -ESC_V,                  -ESC_W,
154      -ESC_X,                  0,
155      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
156      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
157      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
158      CHAR_GRAVE_ACCENT,       7,
159      -ESC_b,                  0,
160      -ESC_d,                  ESC_e,
161      ESC_f,                   0,
162      -ESC_h,                  0,
163      0,                       -ESC_k,
164      0,                       0,
165      ESC_n,                   0,
166      -ESC_p,                  0,
167      ESC_r,                   -ESC_s,
168      ESC_tee,                 0,
169      -ESC_v,                  -ESC_w,
170      0,                       0,
171      -ESC_z
172 };
173 
174 #else
175 
176 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
177 
178 static const short int escapes[] = {
179 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
180 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
181 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
182 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
183 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
184 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
185 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
186 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
187 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
188 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
189 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
190 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
191 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
192 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
193 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
194 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
195 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
196 /*  D0 */   '}',     0, -ESC_K,       0,      0,-ESC_N,      0, -ESC_P,
197 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
198 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
199 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
200 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
201 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
202 };
203 #endif
204 
205 
206 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
207 searched linearly. Put all the names into a single string, in order to reduce
208 the number of relocations when a shared library is dynamically linked. The
209 string is built from string macros so that it works in UTF-8 mode on EBCDIC
210 platforms. */
211 
212 typedef struct verbitem {
213   int   len;                 /* Length of verb name */
214   int   op;                  /* Op when no arg, or -1 if arg mandatory */
215   int   op_arg;              /* Op when arg present, or -1 if not allowed */
216 } verbitem;
217 
218 static const char verbnames[] =
219   "\0"                       /* Empty name is a shorthand for MARK */
220   STRING_MARK0
221   STRING_ACCEPT0
222   STRING_COMMIT0
223   STRING_F0
224   STRING_FAIL0
225   STRING_PRUNE0
226   STRING_SKIP0
227   STRING_THEN;
228 
229 static const verbitem verbs[] = {
230   { 0, -1,        OP_MARK },
231   { 4, -1,        OP_MARK },
232   { 6, OP_ACCEPT, -1 },
233   { 6, OP_COMMIT, -1 },
234   { 1, OP_FAIL,   -1 },
235   { 4, OP_FAIL,   -1 },
236   { 5, OP_PRUNE,  OP_PRUNE_ARG },
237   { 4, OP_SKIP,   OP_SKIP_ARG  },
238   { 4, OP_THEN,   OP_THEN_ARG  }
239 };
240 
241 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
242 
243 
244 /* Tables of names of POSIX character classes and their lengths. The names are
245 now all in a single string, to reduce the number of relocations when a shared
246 library is dynamically loaded. The list of lengths is terminated by a zero
247 length entry. The first three must be alpha, lower, upper, as this is assumed
248 for handling case independence. */
249 
250 static const char posix_names[] =
251   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
252   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
253   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
254   STRING_word0  STRING_xdigit;
255 
256 static const pcre_uint8 posix_name_lengths[] = {
257   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
258 
259 /* Table of class bit maps for each POSIX class. Each class is formed from a
260 base map, with an optional addition or removal of another map. Then, for some
261 classes, there is some additional tweaking: for [:blank:] the vertical space
262 characters are removed, and for [:alpha:] and [:alnum:] the underscore
263 character is removed. The triples in the table consist of the base map offset,
264 second map offset or -1 if no second map, and a non-negative value for map
265 addition or a negative value for map subtraction (if there are two maps). The
266 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
267 remove vertical space characters, 2 => remove underscore. */
268 
269 static const int posix_class_maps[] = {
270   cbit_word,  cbit_digit, -2,             /* alpha */
271   cbit_lower, -1,          0,             /* lower */
272   cbit_upper, -1,          0,             /* upper */
273   cbit_word,  -1,          2,             /* alnum - word without underscore */
274   cbit_print, cbit_cntrl,  0,             /* ascii */
275   cbit_space, -1,          1,             /* blank - a GNU extension */
276   cbit_cntrl, -1,          0,             /* cntrl */
277   cbit_digit, -1,          0,             /* digit */
278   cbit_graph, -1,          0,             /* graph */
279   cbit_print, -1,          0,             /* print */
280   cbit_punct, -1,          0,             /* punct */
281   cbit_space, -1,          0,             /* space */
282   cbit_word,  -1,          0,             /* word - a Perl extension */
283   cbit_xdigit,-1,          0              /* xdigit */
284 };
285 
286 /* Table of substitutes for \d etc when PCRE_UCP is set. The POSIX class
287 substitutes must be in the order of the names, defined above, and there are
288 both positive and negative cases. NULL means no substitute. */
289 
290 #ifdef SUPPORT_UCP
291 static const pcre_uchar string_PNd[]  = {
292   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
293   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
294 static const pcre_uchar string_pNd[]  = {
295   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
296   CHAR_N, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
297 static const pcre_uchar string_PXsp[] = {
298   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
299   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
300 static const pcre_uchar string_pXsp[] = {
301   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
302   CHAR_X, CHAR_s, CHAR_p, CHAR_RIGHT_CURLY_BRACKET, '\0' };
303 static const pcre_uchar string_PXwd[] = {
304   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
305   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
306 static const pcre_uchar string_pXwd[] = {
307   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
308   CHAR_X, CHAR_w, CHAR_d, CHAR_RIGHT_CURLY_BRACKET, '\0' };
309 
310 static const pcre_uchar *substitutes[] = {
311   string_PNd,           /* \D */
312   string_pNd,           /* \d */
313   string_PXsp,          /* \S */       /* NOTE: Xsp is Perl space */
314   string_pXsp,          /* \s */
315   string_PXwd,          /* \W */
316   string_pXwd           /* \w */
317 };
318 
319 static const pcre_uchar string_pL[] =   {
320   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
321   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
322 static const pcre_uchar string_pLl[] =  {
323   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
324   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
325 static const pcre_uchar string_pLu[] =  {
326   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
327   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
328 static const pcre_uchar string_pXan[] = {
329   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
330   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
331 static const pcre_uchar string_h[] =    {
332   CHAR_BACKSLASH, CHAR_h, '\0' };
333 static const pcre_uchar string_pXps[] = {
334   CHAR_BACKSLASH, CHAR_p, CHAR_LEFT_CURLY_BRACKET,
335   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
336 static const pcre_uchar string_PL[] =   {
337   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
338   CHAR_L, CHAR_RIGHT_CURLY_BRACKET, '\0' };
339 static const pcre_uchar string_PLl[] =  {
340   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
341   CHAR_L, CHAR_l, CHAR_RIGHT_CURLY_BRACKET, '\0' };
342 static const pcre_uchar string_PLu[] =  {
343   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
344   CHAR_L, CHAR_u, CHAR_RIGHT_CURLY_BRACKET, '\0' };
345 static const pcre_uchar string_PXan[] = {
346   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
347   CHAR_X, CHAR_a, CHAR_n, CHAR_RIGHT_CURLY_BRACKET, '\0' };
348 static const pcre_uchar string_H[] =    {
349   CHAR_BACKSLASH, CHAR_H, '\0' };
350 static const pcre_uchar string_PXps[] = {
351   CHAR_BACKSLASH, CHAR_P, CHAR_LEFT_CURLY_BRACKET,
352   CHAR_X, CHAR_p, CHAR_s, CHAR_RIGHT_CURLY_BRACKET, '\0' };
353 
354 static const pcre_uchar *posix_substitutes[] = {
355   string_pL,            /* alpha */
356   string_pLl,           /* lower */
357   string_pLu,           /* upper */
358   string_pXan,          /* alnum */
359   NULL,                 /* ascii */
360   string_h,             /* blank */
361   NULL,                 /* cntrl */
362   string_pNd,           /* digit */
363   NULL,                 /* graph */
364   NULL,                 /* print */
365   NULL,                 /* punct */
366   string_pXps,          /* space */    /* NOTE: Xps is POSIX space */
367   string_pXwd,          /* word */
368   NULL,                 /* xdigit */
369   /* Negated cases */
370   string_PL,            /* ^alpha */
371   string_PLl,           /* ^lower */
372   string_PLu,           /* ^upper */
373   string_PXan,          /* ^alnum */
374   NULL,                 /* ^ascii */
375   string_H,             /* ^blank */
376   NULL,                 /* ^cntrl */
377   string_PNd,           /* ^digit */
378   NULL,                 /* ^graph */
379   NULL,                 /* ^print */
380   NULL,                 /* ^punct */
381   string_PXps,          /* ^space */   /* NOTE: Xps is POSIX space */
382   string_PXwd,          /* ^word */
383   NULL                  /* ^xdigit */
384 };
385 #define POSIX_SUBSIZE (sizeof(posix_substitutes) / sizeof(pcre_uchar *))
386 #endif
387 
388 #define STRING(a)  # a
389 #define XSTRING(s) STRING(s)
390 
391 /* The texts of compile-time error messages. These are "char *" because they
392 are passed to the outside world. Do not ever re-use any error number, because
393 they are documented. Always add a new error instead. Messages marked DEAD below
394 are no longer used. This used to be a table of strings, but in order to reduce
395 the number of relocations needed when a shared library is loaded dynamically,
396 it is now one long string. We cannot use a table of offsets, because the
397 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
398 simply count through to the one we want - this isn't a performance issue
399 because these strings are used only when there is a compilation error.
400 
401 Each substring ends with \0 to insert a null character. This includes the final
402 substring, so that the whole string ends with \0\0, which can be detected when
403 counting through. */
404 
405 static const char error_texts[] =
406   "no error\0"
407   "\\ at end of pattern\0"
408   "\\c at end of pattern\0"
409   "unrecognized character follows \\\0"
410   "numbers out of order in {} quantifier\0"
411   /* 5 */
412   "number too big in {} quantifier\0"
413   "missing terminating ] for character class\0"
414   "invalid escape sequence in character class\0"
415   "range out of order in character class\0"
416   "nothing to repeat\0"
417   /* 10 */
418   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
419   "internal error: unexpected repeat\0"
420   "unrecognized character after (? or (?-\0"
421   "POSIX named classes are supported only within a class\0"
422   "missing )\0"
423   /* 15 */
424   "reference to non-existent subpattern\0"
425   "erroffset passed as NULL\0"
426   "unknown option bit(s) set\0"
427   "missing ) after comment\0"
428   "parentheses nested too deeply\0"  /** DEAD **/
429   /* 20 */
430   "regular expression is too large\0"
431   "failed to get memory\0"
432   "unmatched parentheses\0"
433   "internal error: code overflow\0"
434   "unrecognized character after (?<\0"
435   /* 25 */
436   "lookbehind assertion is not fixed length\0"
437   "malformed number or name after (?(\0"
438   "conditional group contains more than two branches\0"
439   "assertion expected after (?(\0"
440   "(?R or (?[+-]digits must be followed by )\0"
441   /* 30 */
442   "unknown POSIX class name\0"
443   "POSIX collating elements are not supported\0"
444   "this version of PCRE is compiled without UTF support\0"
445   "spare error\0"  /** DEAD **/
446   "character value in \\x{...} sequence is too large\0"
447   /* 35 */
448   "invalid condition (?(0)\0"
449   "\\C not allowed in lookbehind assertion\0"
450   "PCRE does not support \\L, \\l, \\N{name}, \\U, or \\u\0"
451   "number after (?C is > 255\0"
452   "closing ) for (?C expected\0"
453   /* 40 */
454   "recursive call could loop indefinitely\0"
455   "unrecognized character after (?P\0"
456   "syntax error in subpattern name (missing terminator)\0"
457   "two named subpatterns have the same name\0"
458   "invalid UTF-8 string\0"
459   /* 45 */
460   "support for \\P, \\p, and \\X has not been compiled\0"
461   "malformed \\P or \\p sequence\0"
462   "unknown property name after \\P or \\p\0"
463   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
464   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
465   /* 50 */
466   "repeated subpattern is too long\0"    /** DEAD **/
467   "octal value is greater than \\377 in 8-bit non-UTF-8 mode\0"
468   "internal error: overran compiling workspace\0"
469   "internal error: previously-checked referenced subpattern not found\0"
470   "DEFINE group contains more than one branch\0"
471   /* 55 */
472   "repeating a DEFINE group is not allowed\0"  /** DEAD **/
473   "inconsistent NEWLINE options\0"
474   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
475   "a numbered reference must not be zero\0"
476   "an argument is not allowed for (*ACCEPT), (*FAIL), or (*COMMIT)\0"
477   /* 60 */
478   "(*VERB) not recognized\0"
479   "number is too big\0"
480   "subpattern name expected\0"
481   "digit expected after (?+\0"
482   "] is an invalid data character in JavaScript compatibility mode\0"
483   /* 65 */
484   "different names for subpatterns of the same number are not allowed\0"
485   "(*MARK) must have an argument\0"
486   "this version of PCRE is not compiled with Unicode property support\0"
487   "\\c must be followed by an ASCII character\0"
488   "\\k is not followed by a braced, angle-bracketed, or quoted name\0"
489   /* 70 */
490   "internal error: unknown opcode in find_fixedlength()\0"
491   "\\N is not supported in a class\0"
492   "too many forward references\0"
493   "disallowed Unicode code point (>= 0xd800 && <= 0xdfff)\0"
494   "invalid UTF-16 string\0"
495   /* 75 */
496   "name is too long in (*MARK), (*PRUNE), (*SKIP), or (*THEN)\0"
497   "character value in \\u.... sequence is too large\0"
498   ;
499 
500 /* Table to identify digits and hex digits. This is used when compiling
501 patterns. Note that the tables in chartables are dependent on the locale, and
502 may mark arbitrary characters as digits - but the PCRE compiling code expects
503 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
504 a private table here. It costs 256 bytes, but it is a lot faster than doing
505 character value tests (at least in some simple cases I timed), and in some
506 applications one wants PCRE to compile efficiently as well as match
507 efficiently.
508 
509 For convenience, we use the same bit definitions as in chartables:
510 
511   0x04   decimal digit
512   0x08   hexadecimal digit
513 
514 Then we can use ctype_digit and ctype_xdigit in the code. */
515 
/* Using a simple comparison for decimal numbers rather than a memory read
is much faster, and the resulting code is simpler (the compiler turns it
into a subtraction and unsigned comparison). IS_DIGIT() is used for this; the
digitab lookup tables that follow are compiled out by the enclosing #if 0. */
519 
520 #define IS_DIGIT(x) ((x) >= CHAR_0 && (x) <= CHAR_9)
521 
522 #if 0
523 #ifndef EBCDIC
524 
525 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
526 UTF-8 mode. */
527 
528 static const pcre_uint8 digitab[] =
529   {
530   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
531   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
532   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
533   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
534   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
535   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
536   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
537   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
538   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
539   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
540   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
541   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
542   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
543   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
544   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
545   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
546   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
547   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
548   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
549   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
550   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
551   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
552   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
553   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
554   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
555   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
556   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
557   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
558   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
559   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
560   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
561   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
562 
563 #else
564 
565 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
566 
567 static const pcre_uint8 digitab[] =
568   {
569   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
570   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
571   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
572   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
573   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
574   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
575   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
576   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
577   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
578   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
579   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
580   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
581   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
582   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
583   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
584   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
585   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
586   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
587   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
588   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
589   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
590   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
591   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
592   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
593   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
594   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
595   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
596   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
597   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
598   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
599   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
600   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
601 
602 static const pcre_uint8 ebcdic_chartab[] = { /* chartable partial dup */
603   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
604   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
605   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
606   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
607   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
608   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
609   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
610   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
611   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
612   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
613   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
614   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
615   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
616   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
617   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
618   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
619   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
620   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
621   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
622   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
623   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
624   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
625   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
626   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
627   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
628   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
629   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
630   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
631   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
632   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
633   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
634   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
635 #endif
636 #endif /* 0 */
637 
638 /* Definition to allow mutual recursion */
639 
640 static BOOL
641   compile_regex(int, pcre_uchar **, const pcre_uchar **, int *, BOOL, BOOL, int, int,
642     int *, int *, branch_chain *, compile_data *, int *);
643 
644 
645 
646 /*************************************************
647 *            Find an error text                  *
648 *************************************************/
649 
650 /* The error texts are now all in one long string, to save on relocations. As
651 some of the text is of unknown length, we can't use a table of offsets.
652 Instead, just count through the strings. This is not a performance issue
653 because it happens only when there has been a compilation error.
654 
655 Argument:   the error number
656 Returns:    pointer to the error string
657 */
658 
659 static const char *
find_error_text(int n)660 find_error_text(int n)
661 {
662 const char *s = error_texts;
663 for (; n > 0; n--)
664   {
665   while (*s++ != 0) {};
666   if (*s == 0) return "Error text not found (please report)";
667   }
668 return s;
669 }
670 
671 
672 /*************************************************
673 *           Expand the workspace                 *
674 *************************************************/
675 
676 /* This function is called during the second compiling phase, if the number of
677 forward references fills the existing workspace, which is originally a block on
678 the stack. A larger block is obtained from malloc() unless the ultimate limit
679 has been reached or the increase will be rather small.
680 
681 Argument: pointer to the compile data block
682 Returns:  0 if all went well, else an error number
683 */
684 
685 static int
expand_workspace(compile_data * cd)686 expand_workspace(compile_data *cd)
687 {
688 pcre_uchar *newspace;
689 int newsize = cd->workspace_size * 2;
690 
691 if (newsize > COMPILE_WORK_SIZE_MAX) newsize = COMPILE_WORK_SIZE_MAX;
692 if (cd->workspace_size >= COMPILE_WORK_SIZE_MAX ||
693     newsize - cd->workspace_size < WORK_SIZE_SAFETY_MARGIN)
694  return ERR72;
695 
696 newspace = (PUBL(malloc))(IN_UCHARS(newsize));
697 if (newspace == NULL) return ERR21;
698 memcpy(newspace, cd->start_workspace, cd->workspace_size * sizeof(pcre_uchar));
699 cd->hwm = (pcre_uchar *)newspace + (cd->hwm - cd->start_workspace);
700 if (cd->workspace_size > COMPILE_WORK_SIZE)
701   (PUBL(free))((void *)cd->start_workspace);
702 cd->start_workspace = newspace;
703 cd->workspace_size = newsize;
704 return 0;
705 }
706 
707 
708 
709 /*************************************************
710 *            Check for counted repeat            *
711 *************************************************/
712 
713 /* This function is called when a '{' is encountered in a place where it might
714 start a quantifier. It looks ahead to see if it really is a quantifier or not.
715 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
716 where the ddds are digits.
717 
718 Arguments:
719   p         pointer to the first char after '{'
720 
721 Returns:    TRUE or FALSE
722 */
723 
724 static BOOL
is_counted_repeat(const pcre_uchar * p)725 is_counted_repeat(const pcre_uchar *p)
726 {
727 if (!IS_DIGIT(*p)) return FALSE;
728 p++;
729 while (IS_DIGIT(*p)) p++;
730 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
731 
732 if (*p++ != CHAR_COMMA) return FALSE;
733 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
734 
735 if (!IS_DIGIT(*p)) return FALSE;
736 p++;
737 while (IS_DIGIT(*p)) p++;
738 
739 return (*p == CHAR_RIGHT_CURLY_BRACKET);
740 }
741 
742 
743 
744 /*************************************************
745 *            Handle escapes                      *
746 *************************************************/
747 
748 /* This function is called when a \ has been encountered. It either returns a
749 positive value for a simple escape such as \n, or a negative value which
750 encodes one of the more complicated things such as \d. A backreference to group
751 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
752 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
753 ptr is pointing at the \. On exit, it is on the final character of the escape
754 sequence.
755 
756 Arguments:
757   ptrptr         points to the pattern position pointer
758   errorcodeptr   points to the errorcode variable
759   bracount       number of previous extracting brackets
760   options        the options bits
761   isclass        TRUE if inside a character class
762 
763 Returns:         zero or positive => a data character
764                  negative => a special escape sequence
765                  on error, errorcodeptr is set
766 */
767 
768 static int
check_escape(const pcre_uchar ** ptrptr,int * errorcodeptr,int bracount,int options,BOOL isclass)769 check_escape(const pcre_uchar **ptrptr, int *errorcodeptr, int bracount,
770   int options, BOOL isclass)
771 {
772 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
773 BOOL utf = (options & PCRE_UTF8) != 0;
774 const pcre_uchar *ptr = *ptrptr + 1;
775 pcre_int32 c;
776 int i;
777 
778 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
779 ptr--;                            /* Set pointer back to the last byte */
780 
781 /* If backslash is at the end of the pattern, it's an error. */
782 
783 if (c == 0) *errorcodeptr = ERR1;
784 
785 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
786 in a table. A non-zero result is something that can be returned immediately.
787 Otherwise further processing may be required. */
788 
789 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
790 /* Not alphanumeric */
791 else if (c < CHAR_0 || c > CHAR_z) {}
792 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
793 
794 #else           /* EBCDIC coding */
795 /* Not alphanumeric */
796 else if (c < 'a' || (!MAX_255(c) || (ebcdic_chartab[c] & 0x0E) == 0)) {}
797 else if ((i = escapes[c - 0x48]) != 0)  c = i;
798 #endif
799 
800 /* Escapes that need further processing, or are illegal. */
801 
802 else
803   {
804   const pcre_uchar *oldptr;
805   BOOL braced, negated;
806 
807   switch (c)
808     {
809     /* A number of Perl escapes are not handled by PCRE. We give an explicit
810     error. */
811 
812     case CHAR_l:
813     case CHAR_L:
814     *errorcodeptr = ERR37;
815     break;
816 
817     case CHAR_u:
818     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
819       {
820       /* In JavaScript, \u must be followed by four hexadecimal numbers.
821       Otherwise it is a lowercase u letter. */
822       if (MAX_255(ptr[1]) && g_ascii_isxdigit(ptr[1]) != 0
823         && MAX_255(ptr[2]) && g_ascii_isxdigit(ptr[2]) != 0
824         && MAX_255(ptr[3]) && g_ascii_isxdigit(ptr[3]) != 0
825         && MAX_255(ptr[4]) && g_ascii_isxdigit(ptr[4]) != 0)
826         {
827         c = 0;
828         for (i = 0; i < 4; ++i)
829           {
830           int cc = *(++ptr);
831 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
832           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
833           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
834 #else           /* EBCDIC coding */
835           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
836           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
837 #endif
838           }
839 
840 #ifdef COMPILE_PCRE8
841         if (c > (utf ? 0x10ffff : 0xff))
842 #else
843 #ifdef COMPILE_PCRE16
844         if (c > (utf ? 0x10ffff : 0xffff))
845 #endif
846 #endif
847           {
848           *errorcodeptr = ERR76;
849           }
850         else if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
851         }
852       }
853     else
854       *errorcodeptr = ERR37;
855     break;
856 
857     case CHAR_U:
858     /* In JavaScript, \U is an uppercase U letter. */
859     if ((options & PCRE_JAVASCRIPT_COMPAT) == 0) *errorcodeptr = ERR37;
860     break;
861 
862     /* In a character class, \g is just a literal "g". Outside a character
863     class, \g must be followed by one of a number of specific things:
864 
865     (1) A number, either plain or braced. If positive, it is an absolute
866     backreference. If negative, it is a relative backreference. This is a Perl
867     5.10 feature.
868 
869     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
870     is part of Perl's movement towards a unified syntax for back references. As
871     this is synonymous with \k{name}, we fudge it up by pretending it really
872     was \k.
873 
874     (3) For Oniguruma compatibility we also support \g followed by a name or a
875     number either in angle brackets or in single quotes. However, these are
876     (possibly recursive) subroutine calls, _not_ backreferences. Just return
877     the -ESC_g code (cf \k). */
878 
879     case CHAR_g:
880     if (isclass) break;
881     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
882       {
883       c = -ESC_g;
884       break;
885       }
886 
887     /* Handle the Perl-compatible cases */
888 
889     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
890       {
891       const pcre_uchar *p;
892       for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
893         if (*p != CHAR_MINUS && !IS_DIGIT(*p)) break;
894       if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
895         {
896         c = -ESC_k;
897         break;
898         }
899       braced = TRUE;
900       ptr++;
901       }
902     else braced = FALSE;
903 
904     if (ptr[1] == CHAR_MINUS)
905       {
906       negated = TRUE;
907       ptr++;
908       }
909     else negated = FALSE;
910 
911     /* The integer range is limited by the machine's int representation. */
912     c = 0;
913     while (IS_DIGIT(ptr[1]))
914       {
915       if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
916         {
917         c = -1;
918         break;
919         }
920       c = c * 10 + *(++ptr) - CHAR_0;
921       }
922     if (((unsigned int)c) > INT_MAX) /* Integer overflow */
923       {
924       while (IS_DIGIT(ptr[1]))
925         ptr++;
926       *errorcodeptr = ERR61;
927       break;
928       }
929 
930     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
931       {
932       *errorcodeptr = ERR57;
933       break;
934       }
935 
936     if (c == 0)
937       {
938       *errorcodeptr = ERR58;
939       break;
940       }
941 
942     if (negated)
943       {
944       if (c > bracount)
945         {
946         *errorcodeptr = ERR15;
947         break;
948         }
949       c = bracount - (c - 1);
950       }
951 
952     c = -(ESC_REF + c);
953     break;
954 
955     /* The handling of escape sequences consisting of a string of digits
956     starting with one that is not zero is not straightforward. By experiment,
957     the way Perl works seems to be as follows:
958 
959     Outside a character class, the digits are read as a decimal number. If the
960     number is less than 10, or if there are that many previous extracting
961     left brackets, then it is a back reference. Otherwise, up to three octal
962     digits are read to form an escaped byte. Thus \123 is likely to be octal
963     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
964     value is greater than 377, the least significant 8 bits are taken. Inside a
965     character class, \ followed by a digit is always an octal number. */
966 
967     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
968     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
969 
970     if (!isclass)
971       {
972       oldptr = ptr;
973       /* The integer range is limited by the machine's int representation. */
974       c -= CHAR_0;
975       while (IS_DIGIT(ptr[1]))
976         {
977         if (((unsigned int)c) > INT_MAX / 10) /* Integer overflow */
978           {
979           c = -1;
980           break;
981           }
982         c = c * 10 + *(++ptr) - CHAR_0;
983         }
984       if (((unsigned int)c) > INT_MAX) /* Integer overflow */
985         {
986         while (IS_DIGIT(ptr[1]))
987           ptr++;
988         *errorcodeptr = ERR61;
989         break;
990         }
991       if (c < 10 || c <= bracount)
992         {
993         c = -(ESC_REF + c);
994         break;
995         }
996       ptr = oldptr;      /* Put the pointer back and fall through */
997       }
998 
999     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
1000     generates a binary zero byte and treats the digit as a following literal.
1001     Thus we have to pull back the pointer by one. */
1002 
1003     if ((c = *ptr) >= CHAR_8)
1004       {
1005       ptr--;
1006       c = 0;
1007       break;
1008       }
1009 
1010     /* \0 always starts an octal number, but we may drop through to here with a
1011     larger first octal digit. The original code used just to take the least
1012     significant 8 bits of octal numbers (I think this is what early Perls used
1013     to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
1014     but no more than 3 octal digits. */
1015 
1016     case CHAR_0:
1017     c -= CHAR_0;
1018     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
1019         c = c * 8 + *(++ptr) - CHAR_0;
1020 #ifdef COMPILE_PCRE8
1021     if (!utf && c > 0xff) *errorcodeptr = ERR51;
1022 #endif
1023     break;
1024 
1025     /* \x is complicated. \x{ddd} is a character number which can be greater
1026     than 0xff in utf or non-8bit mode, but only if the ddd are hex digits.
1027     If not, { is treated as a data character. */
1028 
1029     case CHAR_x:
1030     if ((options & PCRE_JAVASCRIPT_COMPAT) != 0)
1031       {
1032       /* In JavaScript, \x must be followed by two hexadecimal numbers.
1033       Otherwise it is a lowercase x letter. */
1034       if (MAX_255(ptr[1]) && g_ascii_isxdigit(ptr[1]) != 0
1035         && MAX_255(ptr[2]) && g_ascii_isxdigit(ptr[2]) != 0)
1036         {
1037         c = 0;
1038         for (i = 0; i < 2; ++i)
1039           {
1040           int cc = *(++ptr);
1041 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1042           if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1043           c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1044 #else           /* EBCDIC coding */
1045           if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1046           c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1047 #endif
1048           }
1049         }
1050       break;
1051       }
1052 
1053     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
1054       {
1055       const pcre_uchar *pt = ptr + 2;
1056 
1057       c = 0;
1058       while (MAX_255(*pt) && g_ascii_isxdigit(*pt) != 0)
1059         {
1060         int cc = *pt++;
1061         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
1062 
1063 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1064         if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
1065         c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1066 #else           /* EBCDIC coding */
1067         if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
1068         c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1069 #endif
1070 
1071 #ifdef COMPILE_PCRE8
1072         if (c > (utf ? 0x10ffff : 0xff)) { c = -1; break; }
1073 #else
1074 #ifdef COMPILE_PCRE16
1075         if (c > (utf ? 0x10ffff : 0xffff)) { c = -1; break; }
1076 #endif
1077 #endif
1078         }
1079 
1080       if (c < 0)
1081         {
1082         while (MAX_255(*pt) && g_ascii_isxdigit(*pt) != 0) pt++;
1083         *errorcodeptr = ERR34;
1084         }
1085 
1086       if (*pt == CHAR_RIGHT_CURLY_BRACKET)
1087         {
1088         if (utf && c >= 0xd800 && c <= 0xdfff) *errorcodeptr = ERR73;
1089         ptr = pt;
1090         break;
1091         }
1092 
1093       /* If the sequence of hex digits does not end with '}', then we don't
1094       recognize this construct; fall through to the normal \x handling. */
1095       }
1096 
1097     /* Read just a single-byte hex-defined char */
1098 
1099     c = 0;
1100     while (i++ < 2 && MAX_255(ptr[1]) && g_ascii_isxdigit(ptr[1]) != 0)
1101       {
1102       int cc;                                  /* Some compilers don't like */
1103       cc = *(++ptr);                           /* ++ in initializers */
1104 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
1105       if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
1106       c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
1107 #else           /* EBCDIC coding */
1108       if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
1109       c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
1110 #endif
1111       }
1112     break;
1113 
1114     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
1115     An error is given if the byte following \c is not an ASCII character. This
1116     coding is ASCII-specific, but then the whole concept of \cx is
1117     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
1118 
1119     case CHAR_c:
1120     c = *(++ptr);
1121     if (c == 0)
1122       {
1123       *errorcodeptr = ERR2;
1124       break;
1125       }
1126 #ifndef EBCDIC    /* ASCII/UTF-8 coding */
1127     if (c > 127)  /* Excludes all non-ASCII in either mode */
1128       {
1129       *errorcodeptr = ERR68;
1130       break;
1131       }
1132     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
1133     c ^= 0x40;
1134 #else             /* EBCDIC coding */
1135     if (c >= CHAR_a && c <= CHAR_z) c += 64;
1136     c ^= 0xC0;
1137 #endif
1138     break;
1139 
1140     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
1141     other alphanumeric following \ is an error if PCRE_EXTRA was set;
1142     otherwise, for Perl compatibility, it is a literal. This code looks a bit
1143     odd, but there used to be some cases other than the default, and there may
1144     be again in future, so I haven't "optimized" it. */
1145 
1146     default:
1147     if ((options & PCRE_EXTRA) != 0) switch(c)
1148       {
1149       default:
1150       *errorcodeptr = ERR3;
1151       break;
1152       }
1153     break;
1154     }
1155   }
1156 
1157 /* Perl supports \N{name} for character names, as well as plain \N for "not
1158 newline". PCRE does not support \N{name}. However, it does support
1159 quantification such as \N{2,3}. */
1160 
1161 if (c == -ESC_N && ptr[1] == CHAR_LEFT_CURLY_BRACKET &&
1162      !is_counted_repeat(ptr+2))
1163   *errorcodeptr = ERR37;
1164 
1165 /* If PCRE_UCP is set, we change the values for \d etc. */
1166 
1167 if ((options & PCRE_UCP) != 0 && c <= -ESC_D && c >= -ESC_w)
1168   c -= (ESC_DU - ESC_D);
1169 
1170 /* Set the pointer to the final character before returning. */
1171 
1172 *ptrptr = ptr;
1173 return c;
1174 }
1175 
1176 
1177 
#ifdef SUPPORT_UCP
/*************************************************
*               Handle \P and \p                 *
*************************************************/

/* This function is called after \P or \p has been encountered, provided that
PCRE is compiled with support for Unicode properties. On entry, ptrptr is
pointing at the P or p. On exit, it is pointing at the final character of the
escape sequence.

The property name is either a single character, or a name in curly brackets
that may be preceded by ^ for negation. A bracketed name must fit, with its
terminating zero, in a 32-element buffer; a longer name, or one that is not
closed by '}' before the end of the pattern, is rejected with ERR46. A
well-formed name that is not in the property table gives ERR47.

The negation flag is given a defined value (FALSE unless ^ was seen) on every
exit path, including the error returns.

Argument:
  ptrptr         points to the pattern position pointer
  negptr         points to a boolean that is set TRUE for negation else FALSE
  dptr           points to an int that is set to the detailed property value
  errorcodeptr   points to the error code variable

Returns:         type value from ucp_type_table, or -1 for an invalid type
*/

static int
get_ucp(const pcre_uchar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
{
int c, i, bot, top;
const pcre_uchar *ptr = *ptrptr;
pcre_uchar name[32];

/* Initialize the output flag first, so that it is never left unset by the
early error return below. */

*negptr = FALSE;

c = *(++ptr);
if (c == 0) goto ERROR_RETURN;

/* \P or \p can be followed by a name in {}, optionally preceded by ^ for
negation. */

if (c == CHAR_LEFT_CURLY_BRACKET)
  {
  if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
    {
    *negptr = TRUE;
    ptr++;
    }

  /* Copy the name, leaving room for the terminating zero. If the loop runs
  out of buffer, c holds an ordinary name character rather than '}', and the
  test after the loop rejects the name. */

  for (i = 0; i < (int)(sizeof(name) / sizeof(pcre_uchar)) - 1; i++)
    {
    c = *(++ptr);
    if (c == 0) goto ERROR_RETURN;
    if (c == CHAR_RIGHT_CURLY_BRACKET) break;
    name[i] = c;
    }
  if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
  name[i] = 0;
  }

/* Otherwise there is just one following character */

else
  {
  name[0] = c;
  name[1] = 0;
  }

*ptrptr = ptr;

/* Search for a recognized property name using binary chop */

bot = 0;
top = PRIV(utt_size);

while (bot < top)
  {
  i = (bot + top) >> 1;
  c = STRCMP_UC_C8(name, PRIV(utt_names) + PRIV(utt)[i].name_offset);
  if (c == 0)
    {
    *dptr = PRIV(utt)[i].value;
    return PRIV(utt)[i].type;
    }
  if (c > 0) bot = i + 1; else top = i;
  }

*errorcodeptr = ERR47;
*ptrptr = ptr;
return -1;

ERROR_RETURN:
*errorcodeptr = ERR46;
*ptrptr = ptr;
return -1;
}
#endif
1267 
1268 
1269 
1270 
1271 /*************************************************
1272 *         Read repeat counts                     *
1273 *************************************************/
1274 
1275 /* Read an item of the form {n,m} and return the values. This is called only
1276 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
1277 so the syntax is guaranteed to be correct, but we need to check the values.
1278 
1279 Arguments:
1280   p              pointer to first char after '{'
1281   minp           pointer to int for min
1282   maxp           pointer to int for max
1283                  returned as -1 if no max
1284   errorcodeptr   points to error code variable
1285 
1286 Returns:         pointer to '}' on success;
1287                  current ptr on error, with errorcodeptr set non-zero
1288 */
1289 
1290 static const pcre_uchar *
read_repeat_counts(const pcre_uchar * p,int * minp,int * maxp,int * errorcodeptr)1291 read_repeat_counts(const pcre_uchar *p, int *minp, int *maxp, int *errorcodeptr)
1292 {
1293 int min = 0;
1294 int max = -1;
1295 
1296 /* Read the minimum value and do a paranoid check: a negative value indicates
1297 an integer overflow. */
1298 
1299 while (IS_DIGIT(*p)) min = min * 10 + *p++ - CHAR_0;
1300 if (min < 0 || min > 65535)
1301   {
1302   *errorcodeptr = ERR5;
1303   return p;
1304   }
1305 
1306 /* Read the maximum value if there is one, and again do a paranoid on its size.
1307 Also, max must not be less than min. */
1308 
1309 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
1310   {
1311   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
1312     {
1313     max = 0;
1314     while(IS_DIGIT(*p)) max = max * 10 + *p++ - CHAR_0;
1315     if (max < 0 || max > 65535)
1316       {
1317       *errorcodeptr = ERR5;
1318       return p;
1319       }
1320     if (max < min)
1321       {
1322       *errorcodeptr = ERR4;
1323       return p;
1324       }
1325     }
1326   }
1327 
1328 /* Fill in the required variables, and pass back the pointer to the terminating
1329 '}'. */
1330 
1331 *minp = min;
1332 *maxp = max;
1333 return p;
1334 }
1335 
1336 
1337 
1338 /*************************************************
1339 *  Subroutine for finding forward reference      *
1340 *************************************************/
1341 
1342 /* This recursive function is called only from find_parens() below. The
1343 top-level call starts at the beginning of the pattern. All other calls must
1344 start at a parenthesis. It scans along a pattern's text looking for capturing
1345 subpatterns, and counting them. If it finds a named pattern that matches the
1346 name it is given, it returns its number. Alternatively, if the name is NULL, it
1347 returns when it reaches a given numbered subpattern. Recursion is used to keep
1348 track of subpatterns that reset the capturing group numbers - the (?| feature.
1349 
1350 This function was originally called only from the second pass, in which we know
1351 that if (?< or (?' or (?P< is encountered, the name will be correctly
1352 terminated because that is checked in the first pass. There is now one call to
1353 this function in the first pass, to check for a recursive back reference by
1354 name (so that we can make the whole group atomic). In this case, we need check
1355 only up to the current position in the pattern, and that is still OK because
1356 and previous occurrences will have been checked. To make this work, the test
1357 for "end of pattern" is a check against cd->end_pattern in the main loop,
1358 instead of looking for a binary zero. This means that the special first-pass
1359 call can adjust cd->end_pattern temporarily. (Checks for binary zero while
1360 processing items within the loop are OK, because afterwards the main loop will
1361 terminate.)
1362 
1363 Arguments:
1364   ptrptr       address of the current character pointer (updated)
1365   cd           compile background data
1366   name         name to seek, or NULL if seeking a numbered subpattern
1367   lorn         name length, or subpattern number if name is NULL
1368   xmode        TRUE if we are in /x mode
1369   utf          TRUE if we are in UTF-8 / UTF-16 mode
1370   count        pointer to the current capturing subpattern number (updated)
1371 
1372 Returns:       the number of the named subpattern, or -1 if not found
1373 */
1374 
1375 static int
find_parens_sub(pcre_uchar ** ptrptr,compile_data * cd,const pcre_uchar * name,int lorn,BOOL xmode,BOOL utf,int * count)1376 find_parens_sub(pcre_uchar **ptrptr, compile_data *cd, const pcre_uchar *name, int lorn,
1377   BOOL xmode, BOOL utf, int *count)
1378 {
1379 pcre_uchar *ptr = *ptrptr;
1380 int start_count = *count;
1381 int hwm_count = start_count;
1382 BOOL dup_parens = FALSE;
1383 
1384 /* If the first character is a parenthesis, check on the type of group we are
1385 dealing with. The very first call may not start with a parenthesis. */
1386 
1387 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1388   {
1389   /* Handle specials such as (*SKIP) or (*UTF8) etc. */
1390 
1391   if (ptr[1] == CHAR_ASTERISK) ptr += 2;
1392 
1393   /* Handle a normal, unnamed capturing parenthesis. */
1394 
1395   else if (ptr[1] != CHAR_QUESTION_MARK)
1396     {
1397     *count += 1;
1398     if (name == NULL && *count == lorn) return *count;
1399     ptr++;
1400     }
1401 
1402   /* All cases now have (? at the start. Remember when we are in a group
1403   where the parenthesis numbers are duplicated. */
1404 
1405   else if (ptr[2] == CHAR_VERTICAL_LINE)
1406     {
1407     ptr += 3;
1408     dup_parens = TRUE;
1409     }
1410 
1411   /* Handle comments; all characters are allowed until a ket is reached. */
1412 
1413   else if (ptr[2] == CHAR_NUMBER_SIGN)
1414     {
1415     for (ptr += 3; *ptr != 0; ptr++) if (*ptr == CHAR_RIGHT_PARENTHESIS) break;
1416     goto FAIL_EXIT;
1417     }
1418 
1419   /* Handle a condition. If it is an assertion, just carry on so that it
1420   is processed as normal. If not, skip to the closing parenthesis of the
1421   condition (there can't be any nested parens). */
1422 
1423   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1424     {
1425     ptr += 2;
1426     if (ptr[1] != CHAR_QUESTION_MARK)
1427       {
1428       while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1429       if (*ptr != 0) ptr++;
1430       }
1431     }
1432 
1433   /* Start with (? but not a condition. */
1434 
1435   else
1436     {
1437     ptr += 2;
1438     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1439 
1440     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1441 
1442     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1443         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1444       {
1445       int term;
1446       const pcre_uchar *thisname;
1447       *count += 1;
1448       if (name == NULL && *count == lorn) return *count;
1449       term = *ptr++;
1450       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1451       thisname = ptr;
1452       while (*ptr != term) ptr++;
1453       if (name != NULL && lorn == ptr - thisname &&
1454           STRNCMP_UC_UC(name, thisname, lorn) == 0)
1455         return *count;
1456       term++;
1457       }
1458     }
1459   }
1460 
1461 /* Past any initial parenthesis handling, scan for parentheses or vertical
1462 bars. Stop if we get to cd->end_pattern. Note that this is important for the
1463 first-pass call when this value is temporarily adjusted to stop at the current
1464 position. So DO NOT change this to a test for binary zero. */
1465 
1466 for (; ptr < cd->end_pattern; ptr++)
1467   {
1468   /* Skip over backslashed characters and also entire \Q...\E */
1469 
1470   if (*ptr == CHAR_BACKSLASH)
1471     {
1472     if (*(++ptr) == 0) goto FAIL_EXIT;
1473     if (*ptr == CHAR_Q) for (;;)
1474       {
1475       while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1476       if (*ptr == 0) goto FAIL_EXIT;
1477       if (*(++ptr) == CHAR_E) break;
1478       }
1479     continue;
1480     }
1481 
1482   /* Skip over character classes; this logic must be similar to the way they
1483   are handled for real. If the first character is '^', skip it. Also, if the
1484   first few characters (either before or after ^) are \Q\E or \E we skip them
1485   too. This makes for compatibility with Perl. Note the use of STR macros to
1486   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1487 
1488   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1489     {
1490     BOOL negate_class = FALSE;
1491     for (;;)
1492       {
1493       if (ptr[1] == CHAR_BACKSLASH)
1494         {
1495         if (ptr[2] == CHAR_E)
1496           ptr+= 2;
1497         else if (STRNCMP_UC_C8(ptr + 2,
1498                  STR_Q STR_BACKSLASH STR_E, 3) == 0)
1499           ptr += 4;
1500         else
1501           break;
1502         }
1503       else if (!negate_class && ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
1504         {
1505         negate_class = TRUE;
1506         ptr++;
1507         }
1508       else break;
1509       }
1510 
1511     /* If the next character is ']', it is a data character that must be
1512     skipped, except in JavaScript compatibility mode. */
1513 
1514     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1515         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1516       ptr++;
1517 
1518     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1519       {
1520       if (*ptr == 0) return -1;
1521       if (*ptr == CHAR_BACKSLASH)
1522         {
1523         if (*(++ptr) == 0) goto FAIL_EXIT;
1524         if (*ptr == CHAR_Q) for (;;)
1525           {
1526           while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1527           if (*ptr == 0) goto FAIL_EXIT;
1528           if (*(++ptr) == CHAR_E) break;
1529           }
1530         continue;
1531         }
1532       }
1533     continue;
1534     }
1535 
1536   /* Skip comments in /x mode */
1537 
1538   if (xmode && *ptr == CHAR_NUMBER_SIGN)
1539     {
1540     ptr++;
1541     while (*ptr != 0)
1542       {
1543       if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
1544       ptr++;
1545 #ifdef SUPPORT_UTF
1546       if (utf) FORWARDCHAR(ptr);
1547 #endif
1548       }
1549     if (*ptr == 0) goto FAIL_EXIT;
1550     continue;
1551     }
1552 
1553   /* Check for the special metacharacters */
1554 
1555   if (*ptr == CHAR_LEFT_PARENTHESIS)
1556     {
1557     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, count);
1558     if (rc > 0) return rc;
1559     if (*ptr == 0) goto FAIL_EXIT;
1560     }
1561 
1562   else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1563     {
1564     if (dup_parens && *count < hwm_count) *count = hwm_count;
1565     goto FAIL_EXIT;
1566     }
1567 
1568   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1569     {
1570     if (*count > hwm_count) hwm_count = *count;
1571     *count = start_count;
1572     }
1573   }
1574 
1575 FAIL_EXIT:
1576 *ptrptr = ptr;
1577 return -1;
1578 }
1579 
1580 
1581 
1582 
/*************************************************
*       Find forward referenced subpattern       *
*************************************************/

/* This function scans along a pattern's text looking for capturing
subpatterns, and counting them. If it finds a named pattern that matches the
name it is given, it returns its number. Alternatively, if the name is NULL, it
returns when it reaches a given numbered subpattern. This is used for forward
references to subpatterns. We used to be able to start this scan from the
current compiling point, using the current count value from cd->bracount, and
do it all in a single loop, but the addition of the possibility of duplicate
subpattern numbers means that we have to scan from the very start, in order to
take account of such duplicates, and to use a recursive function to keep track
of the different types of group.

Arguments:
  cd           compile background data
  name         name to seek, or NULL if seeking a numbered subpattern
  lorn         name length, or subpattern number if name is NULL
  xmode        TRUE if we are in /x mode
  utf          TRUE if we are in UTF-8 / UTF-16 mode

Returns:       the number of the found subpattern, or -1 if not found
*/
1607 
1608 static int
find_parens(compile_data * cd,const pcre_uchar * name,int lorn,BOOL xmode,BOOL utf)1609 find_parens(compile_data *cd, const pcre_uchar *name, int lorn, BOOL xmode,
1610   BOOL utf)
1611 {
1612 pcre_uchar *ptr = (pcre_uchar *)cd->start_pattern;
1613 int count = 0;
1614 int rc;
1615 
1616 /* If the pattern does not start with an opening parenthesis, the first call
1617 to find_parens_sub() will scan right to the end (if necessary). However, if it
1618 does start with a parenthesis, find_parens_sub() will return when it hits the
1619 matching closing parens. That is why we have to have a loop. */
1620 
1621 for (;;)
1622   {
1623   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, utf, &count);
1624   if (rc > 0 || *ptr++ == 0) break;
1625   }
1626 
1627 return rc;
1628 }
1629 
1630 
1631 
1632 
/*************************************************
*      Find first significant op code            *
*************************************************/

/* This is called by several functions that scan a compiled expression looking
for a fixed first character, or an anchoring op code etc. It skips over things
that do not influence this. For some calls, it makes sense to skip negative
forward and all backward assertions, and also the \b assertion; for others it
does not.

Arguments:
  code         pointer to the start of the group
  skipassert   TRUE if certain assertions are to be skipped

Returns:       pointer to the first significant opcode
*/
1649 
1650 static const pcre_uchar*
first_significant_code(const pcre_uchar * code,BOOL skipassert)1651 first_significant_code(const pcre_uchar *code, BOOL skipassert)
1652 {
1653 for (;;)
1654   {
1655   switch ((int)*code)
1656     {
1657     case OP_ASSERT_NOT:
1658     case OP_ASSERTBACK:
1659     case OP_ASSERTBACK_NOT:
1660     if (!skipassert) return code;
1661     do code += GET(code, 1); while (*code == OP_ALT);
1662     code += PRIV(OP_lengths)[*code];
1663     break;
1664 
1665     case OP_WORD_BOUNDARY:
1666     case OP_NOT_WORD_BOUNDARY:
1667     if (!skipassert) return code;
1668     /* Fall through */
1669 
1670     case OP_CALLOUT:
1671     case OP_CREF:
1672     case OP_NCREF:
1673     case OP_RREF:
1674     case OP_NRREF:
1675     case OP_DEF:
1676     code += PRIV(OP_lengths)[*code];
1677     break;
1678 
1679     default:
1680     return code;
1681     }
1682   }
1683 /* Control never reaches here */
1684 }
1685 
1686 
1687 
1688 
1689 /*************************************************
1690 *        Find the fixed length of a branch       *
1691 *************************************************/
1692 
1693 /* Scan a branch and compute the fixed length of subject that will match it,
1694 if the length is fixed. This is needed for dealing with backward assertions.
1695 In UTF8 mode, the result is in characters rather than bytes. The branch is
1696 temporarily terminated with OP_END when this function is called.
1697 
1698 This function is called when a backward assertion is encountered, so that if it
1699 fails, the error message can point to the correct place in the pattern.
1700 However, we cannot do this when the assertion contains subroutine calls,
1701 because they can be forward references. We solve this by remembering this case
1702 and doing the check at the end; a flag specifies which mode we are running in.
1703 
1704 Arguments:
1705   code     points to the start of the pattern (the bracket)
1706   utf      TRUE in UTF-8 / UTF-16 mode
1707   atend    TRUE if called when the pattern is complete
1708   cd       the "compile data" structure
1709 
1710 Returns:   the fixed length,
1711              or -1 if there is no fixed length,
1712              or -2 if \C was encountered (in UTF-8 mode only)
1713              or -3 if an OP_RECURSE item was encountered and atend is FALSE
1714              or -4 if an unknown opcode was encountered (internal error)
1715 */
1716 
1717 static int
find_fixedlength(pcre_uchar * code,BOOL utf,BOOL atend,compile_data * cd)1718 find_fixedlength(pcre_uchar *code, BOOL utf, BOOL atend, compile_data *cd)
1719 {
1720 int length = -1;
1721 
1722 int branchlength = 0;
1723 pcre_uchar *cc = code + 1 + LINK_SIZE;
1724 
1725 /* Scan along the opcodes for this branch. If we get to the end of the
1726 branch, check the length against that of the other branches. */
1727 
1728 for (;;)
1729   {
1730   int d;
1731   pcre_uchar *ce, *cs;
1732   int op = *cc;
1733 
1734   switch (op)
1735     {
1736     /* We only need to continue for OP_CBRA (normal capturing bracket) and
1737     OP_BRA (normal non-capturing bracket) because the other variants of these
1738     opcodes are all concerned with unlimited repeated groups, which of course
1739     are not of fixed length. */
1740 
1741     case OP_CBRA:
1742     case OP_BRA:
1743     case OP_ONCE:
1744     case OP_ONCE_NC:
1745     case OP_COND:
1746     d = find_fixedlength(cc + ((op == OP_CBRA)? IMM2_SIZE : 0), utf, atend, cd);
1747     if (d < 0) return d;
1748     branchlength += d;
1749     do cc += GET(cc, 1); while (*cc == OP_ALT);
1750     cc += 1 + LINK_SIZE;
1751     break;
1752 
1753     /* Reached end of a branch; if it's a ket it is the end of a nested call.
1754     If it's ALT it is an alternation in a nested call. An ACCEPT is effectively
1755     an ALT. If it is END it's the end of the outer call. All can be handled by
1756     the same code. Note that we must not include the OP_KETRxxx opcodes here,
1757     because they all imply an unlimited repeat. */
1758 
1759     case OP_ALT:
1760     case OP_KET:
1761     case OP_END:
1762     case OP_ACCEPT:
1763     case OP_ASSERT_ACCEPT:
1764     if (length < 0) length = branchlength;
1765       else if (length != branchlength) return -1;
1766     if (*cc != OP_ALT) return length;
1767     cc += 1 + LINK_SIZE;
1768     branchlength = 0;
1769     break;
1770 
1771     /* A true recursion implies not fixed length, but a subroutine call may
1772     be OK. If the subroutine is a forward reference, we can't deal with
1773     it until the end of the pattern, so return -3. */
1774 
1775     case OP_RECURSE:
1776     if (!atend) return -3;
1777     cs = ce = (pcre_uchar *)cd->start_code + GET(cc, 1);  /* Start subpattern */
1778     do ce += GET(ce, 1); while (*ce == OP_ALT);           /* End subpattern */
1779     if (cc > cs && cc < ce) return -1;                    /* Recursion */
1780     d = find_fixedlength(cs + IMM2_SIZE, utf, atend, cd);
1781     if (d < 0) return d;
1782     branchlength += d;
1783     cc += 1 + LINK_SIZE;
1784     break;
1785 
1786     /* Skip over assertive subpatterns */
1787 
1788     case OP_ASSERT:
1789     case OP_ASSERT_NOT:
1790     case OP_ASSERTBACK:
1791     case OP_ASSERTBACK_NOT:
1792     do cc += GET(cc, 1); while (*cc == OP_ALT);
1793     cc += PRIV(OP_lengths)[*cc];
1794     break;
1795 
1796     /* Skip over things that don't match chars */
1797 
1798     case OP_MARK:
1799     case OP_PRUNE_ARG:
1800     case OP_SKIP_ARG:
1801     case OP_THEN_ARG:
1802     cc += cc[1] + PRIV(OP_lengths)[*cc];
1803     break;
1804 
1805     case OP_CALLOUT:
1806     case OP_CIRC:
1807     case OP_CIRCM:
1808     case OP_CLOSE:
1809     case OP_COMMIT:
1810     case OP_CREF:
1811     case OP_DEF:
1812     case OP_DOLL:
1813     case OP_DOLLM:
1814     case OP_EOD:
1815     case OP_EODN:
1816     case OP_FAIL:
1817     case OP_NCREF:
1818     case OP_NRREF:
1819     case OP_NOT_WORD_BOUNDARY:
1820     case OP_PRUNE:
1821     case OP_REVERSE:
1822     case OP_RREF:
1823     case OP_SET_SOM:
1824     case OP_SKIP:
1825     case OP_SOD:
1826     case OP_SOM:
1827     case OP_THEN:
1828     case OP_WORD_BOUNDARY:
1829     cc += PRIV(OP_lengths)[*cc];
1830     break;
1831 
1832     /* Handle literal characters */
1833 
1834     case OP_CHAR:
1835     case OP_CHARI:
1836     case OP_NOT:
1837     case OP_NOTI:
1838     branchlength++;
1839     cc += 2;
1840 #ifdef SUPPORT_UTF
1841     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1842 #endif
1843     break;
1844 
1845     /* Handle exact repetitions. The count is already in characters, but we
1846     need to skip over a multibyte character in UTF8 mode.  */
1847 
1848     case OP_EXACT:
1849     case OP_EXACTI:
1850     case OP_NOTEXACT:
1851     case OP_NOTEXACTI:
1852     branchlength += GET2(cc,1);
1853     cc += 2 + IMM2_SIZE;
1854 #ifdef SUPPORT_UTF
1855     if (utf && HAS_EXTRALEN(cc[-1])) cc += GET_EXTRALEN(cc[-1]);
1856 #endif
1857     break;
1858 
1859     case OP_TYPEEXACT:
1860     branchlength += GET2(cc,1);
1861     if (cc[1 + IMM2_SIZE] == OP_PROP || cc[1 + IMM2_SIZE] == OP_NOTPROP) cc += 2;
1862     cc += 1 + IMM2_SIZE + 1;
1863     break;
1864 
1865     /* Handle single-char matchers */
1866 
1867     case OP_PROP:
1868     case OP_NOTPROP:
1869     cc += 2;
1870     /* Fall through */
1871 
1872     case OP_HSPACE:
1873     case OP_VSPACE:
1874     case OP_NOT_HSPACE:
1875     case OP_NOT_VSPACE:
1876     case OP_NOT_DIGIT:
1877     case OP_DIGIT:
1878     case OP_NOT_WHITESPACE:
1879     case OP_WHITESPACE:
1880     case OP_NOT_WORDCHAR:
1881     case OP_WORDCHAR:
1882     case OP_ANY:
1883     case OP_ALLANY:
1884     branchlength++;
1885     cc++;
1886     break;
1887 
1888     /* The single-byte matcher isn't allowed. This only happens in UTF-8 mode;
1889     otherwise \C is coded as OP_ALLANY. */
1890 
1891     case OP_ANYBYTE:
1892     return -2;
1893 
1894     /* Check a class for variable quantification */
1895 
1896 #if defined SUPPORT_UTF || defined COMPILE_PCRE16
1897     case OP_XCLASS:
1898     cc += GET(cc, 1) - PRIV(OP_lengths)[OP_CLASS];
1899     /* Fall through */
1900 #endif
1901 
1902     case OP_CLASS:
1903     case OP_NCLASS:
1904     cc += PRIV(OP_lengths)[OP_CLASS];
1905 
1906     switch (*cc)
1907       {
1908       case OP_CRPLUS:
1909       case OP_CRMINPLUS:
1910       case OP_CRSTAR:
1911       case OP_CRMINSTAR:
1912       case OP_CRQUERY:
1913       case OP_CRMINQUERY:
1914       return -1;
1915 
1916       case OP_CRRANGE:
1917       case OP_CRMINRANGE:
1918       if (GET2(cc,1) != GET2(cc,1+IMM2_SIZE)) return -1;
1919       branchlength += GET2(cc,1);
1920       cc += 1 + 2 * IMM2_SIZE;
1921       break;
1922 
1923       default:
1924       branchlength++;
1925       }
1926     break;
1927 
1928     /* Anything else is variable length */
1929 
1930     case OP_ANYNL:
1931     case OP_BRAMINZERO:
1932     case OP_BRAPOS:
1933     case OP_BRAPOSZERO:
1934     case OP_BRAZERO:
1935     case OP_CBRAPOS:
1936     case OP_EXTUNI:
1937     case OP_KETRMAX:
1938     case OP_KETRMIN:
1939     case OP_KETRPOS:
1940     case OP_MINPLUS:
1941     case OP_MINPLUSI:
1942     case OP_MINQUERY:
1943     case OP_MINQUERYI:
1944     case OP_MINSTAR:
1945     case OP_MINSTARI:
1946     case OP_MINUPTO:
1947     case OP_MINUPTOI:
1948     case OP_NOTMINPLUS:
1949     case OP_NOTMINPLUSI:
1950     case OP_NOTMINQUERY:
1951     case OP_NOTMINQUERYI:
1952     case OP_NOTMINSTAR:
1953     case OP_NOTMINSTARI:
1954     case OP_NOTMINUPTO:
1955     case OP_NOTMINUPTOI:
1956     case OP_NOTPLUS:
1957     case OP_NOTPLUSI:
1958     case OP_NOTPOSPLUS:
1959     case OP_NOTPOSPLUSI:
1960     case OP_NOTPOSQUERY:
1961     case OP_NOTPOSQUERYI:
1962     case OP_NOTPOSSTAR:
1963     case OP_NOTPOSSTARI:
1964     case OP_NOTPOSUPTO:
1965     case OP_NOTPOSUPTOI:
1966     case OP_NOTQUERY:
1967     case OP_NOTQUERYI:
1968     case OP_NOTSTAR:
1969     case OP_NOTSTARI:
1970     case OP_NOTUPTO:
1971     case OP_NOTUPTOI:
1972     case OP_PLUS:
1973     case OP_PLUSI:
1974     case OP_POSPLUS:
1975     case OP_POSPLUSI:
1976     case OP_POSQUERY:
1977     case OP_POSQUERYI:
1978     case OP_POSSTAR:
1979     case OP_POSSTARI:
1980     case OP_POSUPTO:
1981     case OP_POSUPTOI:
1982     case OP_QUERY:
1983     case OP_QUERYI:
1984     case OP_REF:
1985     case OP_REFI:
1986     case OP_SBRA:
1987     case OP_SBRAPOS:
1988     case OP_SCBRA:
1989     case OP_SCBRAPOS:
1990     case OP_SCOND:
1991     case OP_SKIPZERO:
1992     case OP_STAR:
1993     case OP_STARI:
1994     case OP_TYPEMINPLUS:
1995     case OP_TYPEMINQUERY:
1996     case OP_TYPEMINSTAR:
1997     case OP_TYPEMINUPTO:
1998     case OP_TYPEPLUS:
1999     case OP_TYPEPOSPLUS:
2000     case OP_TYPEPOSQUERY:
2001     case OP_TYPEPOSSTAR:
2002     case OP_TYPEPOSUPTO:
2003     case OP_TYPEQUERY:
2004     case OP_TYPESTAR:
2005     case OP_TYPEUPTO:
2006     case OP_UPTO:
2007     case OP_UPTOI:
2008     return -1;
2009 
2010     /* Catch unrecognized opcodes so that when new ones are added they
2011     are not forgotten, as has happened in the past. */
2012 
2013     default:
2014     return -4;
2015     }
2016   }
2017 /* Control never gets here */
2018 }
2019 
2020 
2021 
2022 
/*************************************************
*    Scan compiled regex for specific bracket    *
*************************************************/

/* This little function scans through a compiled pattern until it finds a
capturing bracket with the given number, or, if the number is negative, an
instance of OP_REVERSE for a lookbehind. The function is global in the C sense
so that it can be called from pcre_study() when finding the minimum matching
length.

Arguments:
  code        points to start of expression
  utf         TRUE in UTF-8 / UTF-16 mode
  number      the required bracket number or negative to find a lookbehind

Returns:      pointer to the opcode for the bracket, or NULL if not found
*/
2040 
2041 const pcre_uchar *
PRIV(find_bracket)2042 PRIV(find_bracket)(const pcre_uchar *code, BOOL utf, int number)
2043 {
2044 for (;;)
2045   {
2046   int c = *code;
2047 
2048   if (c == OP_END) return NULL;
2049 
2050   /* XCLASS is used for classes that cannot be represented just by a bit
2051   map. This includes negated single high-valued characters. The length in
2052   the table is zero; the actual length is stored in the compiled code. */
2053 
2054   if (c == OP_XCLASS) code += GET(code, 1);
2055 
2056   /* Handle recursion */
2057 
2058   else if (c == OP_REVERSE)
2059     {
2060     if (number < 0) return (pcre_uchar *)code;
2061     code += PRIV(OP_lengths)[c];
2062     }
2063 
2064   /* Handle capturing bracket */
2065 
2066   else if (c == OP_CBRA || c == OP_SCBRA ||
2067            c == OP_CBRAPOS || c == OP_SCBRAPOS)
2068     {
2069     int n = GET2(code, 1+LINK_SIZE);
2070     if (n == number) return (pcre_uchar *)code;
2071     code += PRIV(OP_lengths)[c];
2072     }
2073 
2074   /* Otherwise, we can get the item's length from the table, except that for
2075   repeated character types, we have to test for \p and \P, which have an extra
2076   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2077   must add in its length. */
2078 
2079   else
2080     {
2081     switch(c)
2082       {
2083       case OP_TYPESTAR:
2084       case OP_TYPEMINSTAR:
2085       case OP_TYPEPLUS:
2086       case OP_TYPEMINPLUS:
2087       case OP_TYPEQUERY:
2088       case OP_TYPEMINQUERY:
2089       case OP_TYPEPOSSTAR:
2090       case OP_TYPEPOSPLUS:
2091       case OP_TYPEPOSQUERY:
2092       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2093       break;
2094 
2095       case OP_TYPEUPTO:
2096       case OP_TYPEMINUPTO:
2097       case OP_TYPEEXACT:
2098       case OP_TYPEPOSUPTO:
2099       if (code[1 + IMM2_SIZE] == OP_PROP
2100         || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2101       break;
2102 
2103       case OP_MARK:
2104       case OP_PRUNE_ARG:
2105       case OP_SKIP_ARG:
2106       code += code[1];
2107       break;
2108 
2109       case OP_THEN_ARG:
2110       code += code[1];
2111       break;
2112       }
2113 
2114     /* Add in the fixed length from the table */
2115 
2116     code += PRIV(OP_lengths)[c];
2117 
2118   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
2119   a multi-byte character. The length in the table is a minimum, so we have to
2120   arrange to skip the extra bytes. */
2121 
2122 #ifdef SUPPORT_UTF
2123     if (utf) switch(c)
2124       {
2125       case OP_CHAR:
2126       case OP_CHARI:
2127       case OP_EXACT:
2128       case OP_EXACTI:
2129       case OP_UPTO:
2130       case OP_UPTOI:
2131       case OP_MINUPTO:
2132       case OP_MINUPTOI:
2133       case OP_POSUPTO:
2134       case OP_POSUPTOI:
2135       case OP_STAR:
2136       case OP_STARI:
2137       case OP_MINSTAR:
2138       case OP_MINSTARI:
2139       case OP_POSSTAR:
2140       case OP_POSSTARI:
2141       case OP_PLUS:
2142       case OP_PLUSI:
2143       case OP_MINPLUS:
2144       case OP_MINPLUSI:
2145       case OP_POSPLUS:
2146       case OP_POSPLUSI:
2147       case OP_QUERY:
2148       case OP_QUERYI:
2149       case OP_MINQUERY:
2150       case OP_MINQUERYI:
2151       case OP_POSQUERY:
2152       case OP_POSQUERYI:
2153       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2154       break;
2155       }
2156 #else
2157     (void)(utf);  /* Keep compiler happy by referencing function argument */
2158 #endif
2159     }
2160   }
2161 }
2162 
2163 
2164 
/*************************************************
*   Scan compiled regex for recursion reference  *
*************************************************/

/* This little function scans through a compiled pattern until it finds an
instance of OP_RECURSE.

Arguments:
  code        points to start of expression
  utf         TRUE in UTF-8 / UTF-16 mode

Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
*/
2178 
2179 static const pcre_uchar *
find_recurse(const pcre_uchar * code,BOOL utf)2180 find_recurse(const pcre_uchar *code, BOOL utf)
2181 {
2182 for (;;)
2183   {
2184   int c = *code;
2185   if (c == OP_END) return NULL;
2186   if (c == OP_RECURSE) return code;
2187 
2188   /* XCLASS is used for classes that cannot be represented just by a bit
2189   map. This includes negated single high-valued characters. The length in
2190   the table is zero; the actual length is stored in the compiled code. */
2191 
2192   if (c == OP_XCLASS) code += GET(code, 1);
2193 
2194   /* Otherwise, we can get the item's length from the table, except that for
2195   repeated character types, we have to test for \p and \P, which have an extra
2196   two bytes of parameters, and for MARK/PRUNE/SKIP/THEN with an argument, we
2197   must add in its length. */
2198 
2199   else
2200     {
2201     switch(c)
2202       {
2203       case OP_TYPESTAR:
2204       case OP_TYPEMINSTAR:
2205       case OP_TYPEPLUS:
2206       case OP_TYPEMINPLUS:
2207       case OP_TYPEQUERY:
2208       case OP_TYPEMINQUERY:
2209       case OP_TYPEPOSSTAR:
2210       case OP_TYPEPOSPLUS:
2211       case OP_TYPEPOSQUERY:
2212       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2213       break;
2214 
2215       case OP_TYPEPOSUPTO:
2216       case OP_TYPEUPTO:
2217       case OP_TYPEMINUPTO:
2218       case OP_TYPEEXACT:
2219       if (code[1 + IMM2_SIZE] == OP_PROP
2220         || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2221       break;
2222 
2223       case OP_MARK:
2224       case OP_PRUNE_ARG:
2225       case OP_SKIP_ARG:
2226       code += code[1];
2227       break;
2228 
2229       case OP_THEN_ARG:
2230       code += code[1];
2231       break;
2232       }
2233 
2234     /* Add in the fixed length from the table */
2235 
2236     code += PRIV(OP_lengths)[c];
2237 
2238     /* In UTF-8 mode, opcodes that are followed by a character may be followed
2239     by a multi-byte character. The length in the table is a minimum, so we have
2240     to arrange to skip the extra bytes. */
2241 
2242 #ifdef SUPPORT_UTF
2243     if (utf) switch(c)
2244       {
2245       case OP_CHAR:
2246       case OP_CHARI:
2247       case OP_NOT:
2248       case OP_NOTI:
2249       case OP_EXACT:
2250       case OP_EXACTI:
2251       case OP_NOTEXACT:
2252       case OP_NOTEXACTI:
2253       case OP_UPTO:
2254       case OP_UPTOI:
2255       case OP_NOTUPTO:
2256       case OP_NOTUPTOI:
2257       case OP_MINUPTO:
2258       case OP_MINUPTOI:
2259       case OP_NOTMINUPTO:
2260       case OP_NOTMINUPTOI:
2261       case OP_POSUPTO:
2262       case OP_POSUPTOI:
2263       case OP_NOTPOSUPTO:
2264       case OP_NOTPOSUPTOI:
2265       case OP_STAR:
2266       case OP_STARI:
2267       case OP_NOTSTAR:
2268       case OP_NOTSTARI:
2269       case OP_MINSTAR:
2270       case OP_MINSTARI:
2271       case OP_NOTMINSTAR:
2272       case OP_NOTMINSTARI:
2273       case OP_POSSTAR:
2274       case OP_POSSTARI:
2275       case OP_NOTPOSSTAR:
2276       case OP_NOTPOSSTARI:
2277       case OP_PLUS:
2278       case OP_PLUSI:
2279       case OP_NOTPLUS:
2280       case OP_NOTPLUSI:
2281       case OP_MINPLUS:
2282       case OP_MINPLUSI:
2283       case OP_NOTMINPLUS:
2284       case OP_NOTMINPLUSI:
2285       case OP_POSPLUS:
2286       case OP_POSPLUSI:
2287       case OP_NOTPOSPLUS:
2288       case OP_NOTPOSPLUSI:
2289       case OP_QUERY:
2290       case OP_QUERYI:
2291       case OP_NOTQUERY:
2292       case OP_NOTQUERYI:
2293       case OP_MINQUERY:
2294       case OP_MINQUERYI:
2295       case OP_NOTMINQUERY:
2296       case OP_NOTMINQUERYI:
2297       case OP_POSQUERY:
2298       case OP_POSQUERYI:
2299       case OP_NOTPOSQUERY:
2300       case OP_NOTPOSQUERYI:
2301       if (HAS_EXTRALEN(code[-1])) code += GET_EXTRALEN(code[-1]);
2302       break;
2303       }
2304 #else
2305     (void)(utf);  /* Keep compiler happy by referencing function argument */
2306 #endif
2307     }
2308   }
2309 }
2310 
2311 
2312 
/*************************************************
*    Scan compiled branch for non-emptiness      *
*************************************************/

/* This function scans through a branch of a compiled pattern to see whether it
can match the empty string or not. It is called from could_be_empty()
below and from compile_branch() when checking for an unlimited repeat of a
group that can match nothing. Note that first_significant_code() skips over
backward and negative forward assertions when its final argument is TRUE. If we
hit an unclosed bracket, we return "empty" - this means we've struck an inner
bracket whose current branch will already have been scanned.

Arguments:
  code        points to start of search
  endcode     points to where to stop
  utf         TRUE if in UTF-8 / UTF-16 mode
  cd          contains pointers to tables etc.

Returns:      TRUE if what is matched could be empty
*/
2333 
2334 static BOOL
could_be_empty_branch(const pcre_uchar * code,const pcre_uchar * endcode,BOOL utf,compile_data * cd)2335 could_be_empty_branch(const pcre_uchar *code, const pcre_uchar *endcode,
2336   BOOL utf, compile_data *cd)
2337 {
2338 int c;
2339 for (code = first_significant_code(code + PRIV(OP_lengths)[*code], TRUE);
2340      code < endcode;
2341      code = first_significant_code(code + PRIV(OP_lengths)[c], TRUE))
2342   {
2343   const pcre_uchar *ccode;
2344 
2345   c = *code;
2346 
2347   /* Skip over forward assertions; the other assertions are skipped by
2348   first_significant_code() with a TRUE final argument. */
2349 
2350   if (c == OP_ASSERT)
2351     {
2352     do code += GET(code, 1); while (*code == OP_ALT);
2353     c = *code;
2354     continue;
2355     }
2356 
2357   /* For a recursion/subroutine call, if its end has been reached, which
2358   implies a backward reference subroutine call, we can scan it. If it's a
2359   forward reference subroutine call, we can't. To detect forward reference
2360   we have to scan up the list that is kept in the workspace. This function is
2361   called only when doing the real compile, not during the pre-compile that
2362   measures the size of the compiled pattern. */
2363 
2364   if (c == OP_RECURSE)
2365     {
2366     const pcre_uchar *scode;
2367     BOOL empty_branch;
2368 
2369     /* Test for forward reference */
2370 
2371     for (scode = cd->start_workspace; scode < cd->hwm; scode += LINK_SIZE)
2372       if (GET(scode, 0) == code + 1 - cd->start_code) return TRUE;
2373 
2374     /* Not a forward reference, test for completed backward reference */
2375 
2376     empty_branch = FALSE;
2377     scode = cd->start_code + GET(code, 1);
2378     if (GET(scode, 1) == 0) return TRUE;    /* Unclosed */
2379 
2380     /* Completed backwards reference */
2381 
2382     do
2383       {
2384       if (could_be_empty_branch(scode, endcode, utf, cd))
2385         {
2386         empty_branch = TRUE;
2387         break;
2388         }
2389       scode += GET(scode, 1);
2390       }
2391     while (*scode == OP_ALT);
2392 
2393     if (!empty_branch) return FALSE;  /* All branches are non-empty */
2394     continue;
2395     }
2396 
2397   /* Groups with zero repeats can of course be empty; skip them. */
2398 
2399   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO ||
2400       c == OP_BRAPOSZERO)
2401     {
2402     code += PRIV(OP_lengths)[c];
2403     do code += GET(code, 1); while (*code == OP_ALT);
2404     c = *code;
2405     continue;
2406     }
2407 
2408   /* A nested group that is already marked as "could be empty" can just be
2409   skipped. */
2410 
2411   if (c == OP_SBRA  || c == OP_SBRAPOS ||
2412       c == OP_SCBRA || c == OP_SCBRAPOS)
2413     {
2414     do code += GET(code, 1); while (*code == OP_ALT);
2415     c = *code;
2416     continue;
2417     }
2418 
2419   /* For other groups, scan the branches. */
2420 
2421   if (c == OP_BRA  || c == OP_BRAPOS ||
2422       c == OP_CBRA || c == OP_CBRAPOS ||
2423       c == OP_ONCE || c == OP_ONCE_NC ||
2424       c == OP_COND)
2425     {
2426     BOOL empty_branch;
2427     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
2428 
2429     /* If a conditional group has only one branch, there is a second, implied,
2430     empty branch, so just skip over the conditional, because it could be empty.
2431     Otherwise, scan the individual branches of the group. */
2432 
2433     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
2434       code += GET(code, 1);
2435     else
2436       {
2437       empty_branch = FALSE;
2438       do
2439         {
2440         if (!empty_branch && could_be_empty_branch(code, endcode, utf, cd))
2441           empty_branch = TRUE;
2442         code += GET(code, 1);
2443         }
2444       while (*code == OP_ALT);
2445       if (!empty_branch) return FALSE;   /* All branches are non-empty */
2446       }
2447 
2448     c = *code;
2449     continue;
2450     }
2451 
2452   /* Handle the other opcodes */
2453 
2454   switch (c)
2455     {
2456     /* Check for quantifiers after a class. XCLASS is used for classes that
2457     cannot be represented just by a bit map. This includes negated single
2458     high-valued characters. The length in PRIV(OP_lengths)[] is zero; the
2459     actual length is stored in the compiled code, so we must update "code"
2460     here. */
2461 
2462 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2463     case OP_XCLASS:
2464     ccode = code += GET(code, 1);
2465     goto CHECK_CLASS_REPEAT;
2466 #endif
2467 
2468     case OP_CLASS:
2469     case OP_NCLASS:
2470     ccode = code + PRIV(OP_lengths)[OP_CLASS];
2471 
2472 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
2473     CHECK_CLASS_REPEAT:
2474 #endif
2475 
2476     switch (*ccode)
2477       {
2478       case OP_CRSTAR:            /* These could be empty; continue */
2479       case OP_CRMINSTAR:
2480       case OP_CRQUERY:
2481       case OP_CRMINQUERY:
2482       break;
2483 
2484       default:                   /* Non-repeat => class must match */
2485       case OP_CRPLUS:            /* These repeats aren't empty */
2486       case OP_CRMINPLUS:
2487       return FALSE;
2488 
2489       case OP_CRRANGE:
2490       case OP_CRMINRANGE:
2491       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
2492       break;
2493       }
2494     break;
2495 
2496     /* Opcodes that must match a character */
2497 
2498     case OP_PROP:
2499     case OP_NOTPROP:
2500     case OP_EXTUNI:
2501     case OP_NOT_DIGIT:
2502     case OP_DIGIT:
2503     case OP_NOT_WHITESPACE:
2504     case OP_WHITESPACE:
2505     case OP_NOT_WORDCHAR:
2506     case OP_WORDCHAR:
2507     case OP_ANY:
2508     case OP_ALLANY:
2509     case OP_ANYBYTE:
2510     case OP_CHAR:
2511     case OP_CHARI:
2512     case OP_NOT:
2513     case OP_NOTI:
2514     case OP_PLUS:
2515     case OP_MINPLUS:
2516     case OP_POSPLUS:
2517     case OP_EXACT:
2518     case OP_NOTPLUS:
2519     case OP_NOTMINPLUS:
2520     case OP_NOTPOSPLUS:
2521     case OP_NOTEXACT:
2522     case OP_TYPEPLUS:
2523     case OP_TYPEMINPLUS:
2524     case OP_TYPEPOSPLUS:
2525     case OP_TYPEEXACT:
2526     return FALSE;
2527 
2528     /* These are going to continue, as they may be empty, but we have to
2529     fudge the length for the \p and \P cases. */
2530 
2531     case OP_TYPESTAR:
2532     case OP_TYPEMINSTAR:
2533     case OP_TYPEPOSSTAR:
2534     case OP_TYPEQUERY:
2535     case OP_TYPEMINQUERY:
2536     case OP_TYPEPOSQUERY:
2537     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
2538     break;
2539 
2540     /* Same for these */
2541 
2542     case OP_TYPEUPTO:
2543     case OP_TYPEMINUPTO:
2544     case OP_TYPEPOSUPTO:
2545     if (code[1 + IMM2_SIZE] == OP_PROP
2546       || code[1 + IMM2_SIZE] == OP_NOTPROP) code += 2;
2547     break;
2548 
2549     /* End of branch */
2550 
2551     case OP_KET:
2552     case OP_KETRMAX:
2553     case OP_KETRMIN:
2554     case OP_KETRPOS:
2555     case OP_ALT:
2556     return TRUE;
2557 
2558     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
2559     MINUPTO, and POSUPTO may be followed by a multibyte character */
2560 
2561 #ifdef SUPPORT_UTF
2562     case OP_STAR:
2563     case OP_STARI:
2564     case OP_MINSTAR:
2565     case OP_MINSTARI:
2566     case OP_POSSTAR:
2567     case OP_POSSTARI:
2568     case OP_QUERY:
2569     case OP_QUERYI:
2570     case OP_MINQUERY:
2571     case OP_MINQUERYI:
2572     case OP_POSQUERY:
2573     case OP_POSQUERYI:
2574     if (utf && HAS_EXTRALEN(code[1])) code += GET_EXTRALEN(code[1]);
2575     break;
2576 
2577     case OP_UPTO:
2578     case OP_UPTOI:
2579     case OP_MINUPTO:
2580     case OP_MINUPTOI:
2581     case OP_POSUPTO:
2582     case OP_POSUPTOI:
2583     if (utf && HAS_EXTRALEN(code[1 + IMM2_SIZE])) code += GET_EXTRALEN(code[1 + IMM2_SIZE]);
2584     break;
2585 #endif
2586 
2587     /* MARK, and PRUNE/SKIP/THEN with an argument must skip over the argument
2588     string. */
2589 
2590     case OP_MARK:
2591     case OP_PRUNE_ARG:
2592     case OP_SKIP_ARG:
2593     code += code[1];
2594     break;
2595 
2596     case OP_THEN_ARG:
2597     code += code[1];
2598     break;
2599 
2600     /* None of the remaining opcodes are required to match a character. */
2601 
2602     default:
2603     break;
2604     }
2605   }
2606 
2607 return TRUE;
2608 }
2609 
2610 
2611 
/*************************************************
*    Scan compiled regex for non-emptiness       *
*************************************************/

/* This function is called to check for left recursive calls. We want to check
the current branch of the current pattern to see if it could match the empty
string. If it could, we must look outwards for branches at other levels,
stopping when we pass beyond the bracket which is the subject of the recursion.
This function is called only during the real compile, not during the
pre-compile.

Arguments:
  code        points to start of the recursion
  endcode     points to where to stop (current RECURSE item)
  bcptr       points to the chain of current (unclosed) branch starts
  utf         TRUE if in UTF-8 / UTF-16 mode
  cd          pointers to tables etc

Returns:      TRUE if what is matched could be empty
*/
2632 
2633 static BOOL
could_be_empty(const pcre_uchar * code,const pcre_uchar * endcode,branch_chain * bcptr,BOOL utf,compile_data * cd)2634 could_be_empty(const pcre_uchar *code, const pcre_uchar *endcode,
2635   branch_chain *bcptr, BOOL utf, compile_data *cd)
2636 {
2637 while (bcptr != NULL && bcptr->current_branch >= code)
2638   {
2639   if (!could_be_empty_branch(bcptr->current_branch, endcode, utf, cd))
2640     return FALSE;
2641   bcptr = bcptr->outer;
2642   }
2643 return TRUE;
2644 }
2645 
2646 
2647 
2648 /*************************************************
2649 *           Check for POSIX class syntax         *
2650 *************************************************/
2651 
2652 /* This function is called when the sequence "[:" or "[." or "[=" is
2653 encountered in a character class. It checks whether this is followed by a
2654 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
2655 reach an unescaped ']' without the special preceding character, return FALSE.
2656 
2657 Originally, this function only recognized a sequence of letters between the
2658 terminators, but it seems that Perl recognizes any sequence of characters,
2659 though of course unknown POSIX names are subsequently rejected. Perl gives an
2660 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
2661 didn't consider this to be a POSIX class. Likewise for [:1234:].
2662 
2663 The problem in trying to be exactly like Perl is in the handling of escapes. We
2664 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
2665 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
2666 below handles the special case of \], but does not try to do any other escape
2667 processing. This makes it different from Perl for cases such as [:l\ower:]
2668 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
2669 "l\ower". This is a lesser evil that not diagnosing bad classes when Perl does,
2670 I think.
2671 
2672 A user pointed out that PCRE was rejecting [:a[:digit:]] whereas Perl was not.
2673 It seems that the appearance of a nested POSIX class supersedes an apparent
2674 external class. For example, [:a[:digit:]b:] matches "a", "b", ":", or
2675 a digit.
2676 
2677 In Perl, unescaped square brackets may also appear as part of class names. For
2678 example, [:a[:abc]b:] gives unknown POSIX class "[:abc]b:]". However, for
2679 [:a[:abc]b][b:] it gives unknown POSIX class "[:abc]b][b:]", which does not
2680 seem right at all. PCRE does not allow closing square brackets in POSIX class
2681 names.
2682 
2683 Arguments:
2684   ptr      pointer to the initial [
2685   endptr   where to return the end pointer
2686 
2687 Returns:   TRUE or FALSE
2688 */
2689 
2690 static BOOL
check_posix_syntax(const pcre_uchar * ptr,const pcre_uchar ** endptr)2691 check_posix_syntax(const pcre_uchar *ptr, const pcre_uchar **endptr)
2692 {
2693 int terminator;          /* Don't combine these lines; the Solaris cc */
2694 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
2695 for (++ptr; *ptr != 0; ptr++)
2696   {
2697   if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2698     ptr++;
2699   else if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2700   else
2701     {
2702     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2703       {
2704       *endptr = ptr;
2705       return TRUE;
2706       }
2707     if (*ptr == CHAR_LEFT_SQUARE_BRACKET &&
2708          (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2709           ptr[1] == CHAR_EQUALS_SIGN) &&
2710         check_posix_syntax(ptr, endptr))
2711       return FALSE;
2712     }
2713   }
2714 return FALSE;
2715 }
2716 
2717 
2718 
2719 
2720 /*************************************************
2721 *          Check POSIX class name                *
2722 *************************************************/
2723 
2724 /* This function is called to check the name given in a POSIX-style class entry
2725 such as [:alnum:].
2726 
2727 Arguments:
2728   ptr        points to the first letter
2729   len        the length of the name
2730 
2731 Returns:     a value representing the name, or -1 if unknown
2732 */
2733 
2734 static int
check_posix_name(const pcre_uchar * ptr,int len)2735 check_posix_name(const pcre_uchar *ptr, int len)
2736 {
2737 const char *pn = posix_names;
2738 int yield = 0;
2739 while (posix_name_lengths[yield] != 0)
2740   {
2741   if (len == posix_name_lengths[yield] &&
2742     STRNCMP_UC_C8(ptr, pn, len) == 0) return yield;
2743   pn += posix_name_lengths[yield] + 1;
2744   yield++;
2745   }
2746 return -1;
2747 }
2748 
2749 
2750 /*************************************************
2751 *    Adjust OP_RECURSE items in repeated group   *
2752 *************************************************/
2753 
2754 /* OP_RECURSE items contain an offset from the start of the regex to the group
2755 that is referenced. This means that groups can be replicated for fixed
2756 repetition simply by copying (because the recursion is allowed to refer to
2757 earlier groups that are outside the current group). However, when a group is
2758 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2759 inserted before it, after it has been compiled. This means that any OP_RECURSE
2760 items within it that refer to the group itself or any contained groups have to
2761 have their offsets adjusted. That one of the jobs of this function. Before it
2762 is called, the partially compiled regex must be temporarily terminated with
2763 OP_END.
2764 
2765 This function has been extended with the possibility of forward references for
2766 recursions and subroutine calls. It must also check the list of such references
2767 for the group we are dealing with. If it finds that one of the recursions in
2768 the current group is on this list, it adjusts the offset in the list, not the
2769 value in the reference (which is a group number).
2770 
2771 Arguments:
2772   group      points to the start of the group
2773   adjust     the amount by which the group is to be moved
2774   utf        TRUE in UTF-8 / UTF-16 mode
2775   cd         contains pointers to tables etc.
2776   save_hwm   the hwm forward reference pointer at the start of the group
2777 
2778 Returns:     nothing
2779 */
2780 
2781 static void
adjust_recurse(pcre_uchar * group,int adjust,BOOL utf,compile_data * cd,pcre_uchar * save_hwm)2782 adjust_recurse(pcre_uchar *group, int adjust, BOOL utf, compile_data *cd,
2783   pcre_uchar *save_hwm)
2784 {
2785 pcre_uchar *ptr = group;
2786 
2787 while ((ptr = (pcre_uchar *)find_recurse(ptr, utf)) != NULL)
2788   {
2789   int offset;
2790   pcre_uchar *hc;
2791 
2792   /* See if this recursion is on the forward reference list. If so, adjust the
2793   reference. */
2794 
2795   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2796     {
2797     offset = GET(hc, 0);
2798     if (cd->start_code + offset == ptr + 1)
2799       {
2800       PUT(hc, 0, offset + adjust);
2801       break;
2802       }
2803     }
2804 
2805   /* Otherwise, adjust the recursion offset if it's after the start of this
2806   group. */
2807 
2808   if (hc >= cd->hwm)
2809     {
2810     offset = GET(ptr, 1);
2811     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2812     }
2813 
2814   ptr += 1 + LINK_SIZE;
2815   }
2816 }
2817 
2818 
2819 
2820 /*************************************************
2821 *        Insert an automatic callout point       *
2822 *************************************************/
2823 
2824 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2825 callout points before each pattern item.
2826 
2827 Arguments:
2828   code           current code pointer
2829   ptr            current pattern pointer
2830   cd             pointers to tables etc
2831 
2832 Returns:         new code pointer
2833 */
2834 
2835 static pcre_uchar *
auto_callout(pcre_uchar * code,const pcre_uchar * ptr,compile_data * cd)2836 auto_callout(pcre_uchar *code, const pcre_uchar *ptr, compile_data *cd)
2837 {
2838 *code++ = OP_CALLOUT;
2839 *code++ = 255;
2840 PUT(code, 0, (int)(ptr - cd->start_pattern));  /* Pattern offset */
2841 PUT(code, LINK_SIZE, 0);                       /* Default length */
2842 return code + 2 * LINK_SIZE;
2843 }
2844 
2845 
2846 
2847 /*************************************************
2848 *         Complete a callout item                *
2849 *************************************************/
2850 
2851 /* A callout item contains the length of the next item in the pattern, which
2852 we can't fill in till after we have reached the relevant point. This is used
2853 for both automatic and manual callouts.
2854 
2855 Arguments:
2856   previous_callout   points to previous callout item
2857   ptr                current pattern pointer
2858   cd                 pointers to tables etc
2859 
2860 Returns:             nothing
2861 */
2862 
2863 static void
complete_callout(pcre_uchar * previous_callout,const pcre_uchar * ptr,compile_data * cd)2864 complete_callout(pcre_uchar *previous_callout, const pcre_uchar *ptr, compile_data *cd)
2865 {
2866 int length = (int)(ptr - cd->start_pattern - GET(previous_callout, 2));
2867 PUT(previous_callout, 2 + LINK_SIZE, length);
2868 }
2869 
2870 
2871 
2872 #ifdef SUPPORT_UCP
2873 /*************************************************
2874 *           Get othercase range                  *
2875 *************************************************/
2876 
2877 /* This function is passed the start and end of a class range, in UTF-8 mode
2878 with UCP support. It searches up the characters, looking for internal ranges of
2879 characters in the "other" case. Each call returns the next one, updating the
2880 start address.
2881 
2882 Arguments:
2883   cptr        points to starting character value; updated
2884   d           end value
2885   ocptr       where to put start of othercase range
2886   odptr       where to put end of othercase range
2887 
2888 Yield:        TRUE when range returned; FALSE when no more
2889 */
2890 
2891 static BOOL
get_othercase_range(unsigned int * cptr,unsigned int d,unsigned int * ocptr,unsigned int * odptr)2892 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2893   unsigned int *odptr)
2894 {
2895 unsigned int c, othercase, next;
2896 
2897 for (c = *cptr; c <= d; c++)
2898   { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2899 
2900 if (c > d) return FALSE;
2901 
2902 *ocptr = othercase;
2903 next = othercase + 1;
2904 
2905 for (++c; c <= d; c++)
2906   {
2907   if (UCD_OTHERCASE(c) != next) break;
2908   next++;
2909   }
2910 
2911 *odptr = next - 1;
2912 *cptr = c;
2913 
2914 return TRUE;
2915 }
2916 
2917 
2918 
2919 /*************************************************
2920 *        Check a character and a property        *
2921 *************************************************/
2922 
2923 /* This function is called by check_auto_possessive() when a property item
2924 is adjacent to a fixed character.
2925 
2926 Arguments:
2927   c            the character
2928   ptype        the property type
2929   pdata        the data for the type
2930   negated      TRUE if it's a negated property (\P or \p{^)
2931 
2932 Returns:       TRUE if auto-possessifying is OK
2933 */
2934 
2935 static BOOL
check_char_prop(int c,int ptype,int pdata,BOOL negated)2936 check_char_prop(int c, int ptype, int pdata, BOOL negated)
2937 {
2938 const pcre_uint8 chartype = UCD_CHARTYPE(c);
2939 switch(ptype)
2940   {
2941   case PT_LAMP:
2942   return (chartype == ucp_Lu ||
2943           chartype == ucp_Ll ||
2944           chartype == ucp_Lt) == negated;
2945 
2946   case PT_GC:
2947   return (pdata == PRIV(ucp_gentype)[chartype]) == negated;
2948 
2949   case PT_PC:
2950   return (pdata == chartype) == negated;
2951 
2952   case PT_SC:
2953   return (pdata == UCD_SCRIPT(c)) == negated;
2954 
2955   /* These are specials */
2956 
2957   case PT_ALNUM:
2958   return (PRIV(ucp_gentype)[chartype] == ucp_L ||
2959           PRIV(ucp_gentype)[chartype] == ucp_N) == negated;
2960 
2961   case PT_SPACE:    /* Perl space */
2962   return (PRIV(ucp_gentype)[chartype] == ucp_Z ||
2963           c == CHAR_HT || c == CHAR_NL || c == CHAR_FF || c == CHAR_CR)
2964           == negated;
2965 
2966   case PT_PXSPACE:  /* POSIX space */
2967   return (PRIV(ucp_gentype)[chartype] == ucp_Z ||
2968           c == CHAR_HT || c == CHAR_NL || c == CHAR_VT ||
2969           c == CHAR_FF || c == CHAR_CR)
2970           == negated;
2971 
2972   case PT_WORD:
2973   return (PRIV(ucp_gentype)[chartype] == ucp_L ||
2974           PRIV(ucp_gentype)[chartype] == ucp_N ||
2975           c == CHAR_UNDERSCORE) == negated;
2976   }
2977 return FALSE;
2978 }
2979 #endif  /* SUPPORT_UCP */
2980 
2981 
2982 
2983 /*************************************************
2984 *     Check if auto-possessifying is possible    *
2985 *************************************************/
2986 
2987 /* This function is called for unlimited repeats of certain items, to see
2988 whether the next thing could possibly match the repeated item. If not, it makes
2989 sense to automatically possessify the repeated item.
2990 
2991 Arguments:
2992   previous      pointer to the repeated opcode
2993   utf           TRUE in UTF-8 / UTF-16 mode
2994   ptr           next character in pattern
2995   options       options bits
2996   cd            contains pointers to tables etc.
2997 
2998 Returns:        TRUE if possessifying is wanted
2999 */
3000 
3001 static BOOL
check_auto_possessive(const pcre_uchar * previous,BOOL utf,const pcre_uchar * ptr,int options,compile_data * cd)3002 check_auto_possessive(const pcre_uchar *previous, BOOL utf,
3003   const pcre_uchar *ptr, int options, compile_data *cd)
3004 {
3005 pcre_int32 c, next;
3006 int op_code = *previous++;
3007 
3008 /* Skip whitespace and comments in extended mode */
3009 
3010 if ((options & PCRE_EXTENDED) != 0)
3011   {
3012   for (;;)
3013     {
3014     while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3015     if (*ptr == CHAR_NUMBER_SIGN)
3016       {
3017       ptr++;
3018       while (*ptr != 0)
3019         {
3020         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3021         ptr++;
3022 #ifdef SUPPORT_UTF
3023         if (utf) FORWARDCHAR(ptr);
3024 #endif
3025         }
3026       }
3027     else break;
3028     }
3029   }
3030 
3031 /* If the next item is one that we can handle, get its value. A non-negative
3032 value is a character, a negative value is an escape value. */
3033 
3034 if (*ptr == CHAR_BACKSLASH)
3035   {
3036   int temperrorcode = 0;
3037   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
3038   if (temperrorcode != 0) return FALSE;
3039   ptr++;    /* Point after the escape sequence */
3040   }
3041 else if (!MAX_255(*ptr) || (cd->ctypes[*ptr] & ctype_meta) == 0)
3042   {
3043 #ifdef SUPPORT_UTF
3044   if (utf) { GETCHARINC(next, ptr); } else
3045 #endif
3046   next = *ptr++;
3047   }
3048 else return FALSE;
3049 
3050 /* Skip whitespace and comments in extended mode */
3051 
3052 if ((options & PCRE_EXTENDED) != 0)
3053   {
3054   for (;;)
3055     {
3056     while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
3057     if (*ptr == CHAR_NUMBER_SIGN)
3058       {
3059       ptr++;
3060       while (*ptr != 0)
3061         {
3062         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
3063         ptr++;
3064 #ifdef SUPPORT_UTF
3065         if (utf) FORWARDCHAR(ptr);
3066 #endif
3067         }
3068       }
3069     else break;
3070     }
3071   }
3072 
3073 /* If the next thing is itself optional, we have to give up. */
3074 
3075 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3076   STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3077     return FALSE;
3078 
3079 /* Now compare the next item with the previous opcode. First, handle cases when
3080 the next item is a character. */
3081 
3082 if (next >= 0) switch(op_code)
3083   {
3084   case OP_CHAR:
3085 #ifdef SUPPORT_UTF
3086   GETCHARTEST(c, previous);
3087 #else
3088   c = *previous;
3089 #endif
3090   return c != next;
3091 
3092   /* For CHARI (caseless character) we must check the other case. If we have
3093   Unicode property support, we can use it to test the other case of
3094   high-valued characters. */
3095 
3096   case OP_CHARI:
3097 #ifdef SUPPORT_UTF
3098   GETCHARTEST(c, previous);
3099 #else
3100   c = *previous;
3101 #endif
3102   if (c == next) return FALSE;
3103 #ifdef SUPPORT_UTF
3104   if (utf)
3105     {
3106     unsigned int othercase;
3107     if (next < 128) othercase = cd->fcc[next]; else
3108 #ifdef SUPPORT_UCP
3109     othercase = UCD_OTHERCASE((unsigned int)next);
3110 #else
3111     othercase = NOTACHAR;
3112 #endif
3113     return (unsigned int)c != othercase;
3114     }
3115   else
3116 #endif  /* SUPPORT_UTF */
3117   return (c != TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
3118 
3119   case OP_NOT:
3120 #ifdef SUPPORT_UTF
3121   GETCHARTEST(c, previous);
3122 #else
3123   c = *previous;
3124 #endif
3125   return c == next;
3126 
3127   case OP_NOTI:
3128 #ifdef SUPPORT_UTF
3129   GETCHARTEST(c, previous);
3130 #else
3131   c = *previous;
3132 #endif
3133   if (c == next) return TRUE;
3134 #ifdef SUPPORT_UTF
3135   if (utf)
3136     {
3137     unsigned int othercase;
3138     if (next < 128) othercase = cd->fcc[next]; else
3139 #ifdef SUPPORT_UCP
3140     othercase = UCD_OTHERCASE((unsigned int)next);
3141 #else
3142     othercase = NOTACHAR;
3143 #endif
3144     return (unsigned int)c == othercase;
3145     }
3146   else
3147 #endif  /* SUPPORT_UTF */
3148   return (c == TABLE_GET((unsigned int)next, cd->fcc, next));  /* Non-UTF-8 mode */
3149 
3150   /* Note that OP_DIGIT etc. are generated only when PCRE_UCP is *not* set.
3151   When it is set, \d etc. are converted into OP_(NOT_)PROP codes. */
3152 
3153   case OP_DIGIT:
3154   return next > 255 || (cd->ctypes[next] & ctype_digit) == 0;
3155 
3156   case OP_NOT_DIGIT:
3157   return next <= 255 && (cd->ctypes[next] & ctype_digit) != 0;
3158 
3159   case OP_WHITESPACE:
3160   return next > 255 || (cd->ctypes[next] & ctype_space) == 0;
3161 
3162   case OP_NOT_WHITESPACE:
3163   return next <= 255 && (cd->ctypes[next] & ctype_space) != 0;
3164 
3165   case OP_WORDCHAR:
3166   return next > 255 || (cd->ctypes[next] & ctype_word) == 0;
3167 
3168   case OP_NOT_WORDCHAR:
3169   return next <= 255 && (cd->ctypes[next] & ctype_word) != 0;
3170 
3171   case OP_HSPACE:
3172   case OP_NOT_HSPACE:
3173   switch(next)
3174     {
3175     case 0x09:
3176     case 0x20:
3177     case 0xa0:
3178     case 0x1680:
3179     case 0x180e:
3180     case 0x2000:
3181     case 0x2001:
3182     case 0x2002:
3183     case 0x2003:
3184     case 0x2004:
3185     case 0x2005:
3186     case 0x2006:
3187     case 0x2007:
3188     case 0x2008:
3189     case 0x2009:
3190     case 0x200A:
3191     case 0x202f:
3192     case 0x205f:
3193     case 0x3000:
3194     return op_code == OP_NOT_HSPACE;
3195     default:
3196     return op_code != OP_NOT_HSPACE;
3197     }
3198 
3199   case OP_ANYNL:
3200   case OP_VSPACE:
3201   case OP_NOT_VSPACE:
3202   switch(next)
3203     {
3204     case 0x0a:
3205     case 0x0b:
3206     case 0x0c:
3207     case 0x0d:
3208     case 0x85:
3209     case 0x2028:
3210     case 0x2029:
3211     return op_code == OP_NOT_VSPACE;
3212     default:
3213     return op_code != OP_NOT_VSPACE;
3214     }
3215 
3216 #ifdef SUPPORT_UCP
3217   case OP_PROP:
3218   return check_char_prop(next, previous[0], previous[1], FALSE);
3219 
3220   case OP_NOTPROP:
3221   return check_char_prop(next, previous[0], previous[1], TRUE);
3222 #endif
3223 
3224   default:
3225   return FALSE;
3226   }
3227 
3228 
3229 /* Handle the case when the next item is \d, \s, etc. Note that when PCRE_UCP
3230 is set, \d turns into ESC_du rather than ESC_d, etc., so ESC_d etc. are
3231 generated only when PCRE_UCP is *not* set, that is, when only ASCII
3232 characteristics are recognized. Similarly, the opcodes OP_DIGIT etc. are
3233 replaced by OP_PROP codes when PCRE_UCP is set. */
3234 
3235 switch(op_code)
3236   {
3237   case OP_CHAR:
3238   case OP_CHARI:
3239 #ifdef SUPPORT_UTF
3240   GETCHARTEST(c, previous);
3241 #else
3242   c = *previous;
3243 #endif
3244   switch(-next)
3245     {
3246     case ESC_d:
3247     return c > 255 || (cd->ctypes[c] & ctype_digit) == 0;
3248 
3249     case ESC_D:
3250     return c <= 255 && (cd->ctypes[c] & ctype_digit) != 0;
3251 
3252     case ESC_s:
3253     return c > 255 || (cd->ctypes[c] & ctype_space) == 0;
3254 
3255     case ESC_S:
3256     return c <= 255 && (cd->ctypes[c] & ctype_space) != 0;
3257 
3258     case ESC_w:
3259     return c > 255 || (cd->ctypes[c] & ctype_word) == 0;
3260 
3261     case ESC_W:
3262     return c <= 255 && (cd->ctypes[c] & ctype_word) != 0;
3263 
3264     case ESC_h:
3265     case ESC_H:
3266     switch(c)
3267       {
3268       case 0x09:
3269       case 0x20:
3270       case 0xa0:
3271       case 0x1680:
3272       case 0x180e:
3273       case 0x2000:
3274       case 0x2001:
3275       case 0x2002:
3276       case 0x2003:
3277       case 0x2004:
3278       case 0x2005:
3279       case 0x2006:
3280       case 0x2007:
3281       case 0x2008:
3282       case 0x2009:
3283       case 0x200A:
3284       case 0x202f:
3285       case 0x205f:
3286       case 0x3000:
3287       return -next != ESC_h;
3288       default:
3289       return -next == ESC_h;
3290       }
3291 
3292     case ESC_v:
3293     case ESC_V:
3294     switch(c)
3295       {
3296       case 0x0a:
3297       case 0x0b:
3298       case 0x0c:
3299       case 0x0d:
3300       case 0x85:
3301       case 0x2028:
3302       case 0x2029:
3303       return -next != ESC_v;
3304       default:
3305       return -next == ESC_v;
3306       }
3307 
3308     /* When PCRE_UCP is set, these values get generated for \d etc. Find
3309     their substitutions and process them. The result will always be either
3310     -ESC_p or -ESC_P. Then fall through to process those values. */
3311 
3312 #ifdef SUPPORT_UCP
3313     case ESC_du:
3314     case ESC_DU:
3315     case ESC_wu:
3316     case ESC_WU:
3317     case ESC_su:
3318     case ESC_SU:
3319       {
3320       int temperrorcode = 0;
3321       ptr = substitutes[-next - ESC_DU];
3322       next = check_escape(&ptr, &temperrorcode, 0, options, FALSE);
3323       if (temperrorcode != 0) return FALSE;
3324       ptr++;    /* For compatibility */
3325       }
3326     /* Fall through */
3327 
3328     case ESC_p:
3329     case ESC_P:
3330       {
3331       int ptype, pdata, errorcodeptr;
3332       BOOL negated;
3333 
3334       ptr--;      /* Make ptr point at the p or P */
3335       ptype = get_ucp(&ptr, &negated, &pdata, &errorcodeptr);
3336       if (ptype < 0) return FALSE;
3337       ptr++;      /* Point past the final curly ket */
3338 
3339       /* If the property item is optional, we have to give up. (When generated
3340       from \d etc by PCRE_UCP, this test will have been applied much earlier,
3341       to the original \d etc. At this point, ptr will point to a zero byte. */
3342 
3343       if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
3344         STRNCMP_UC_C8(ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
3345           return FALSE;
3346 
3347       /* Do the property check. */
3348 
3349       return check_char_prop(c, ptype, pdata, (next == -ESC_P) != negated);
3350       }
3351 #endif
3352 
3353     default:
3354     return FALSE;
3355     }
3356 
3357   /* In principle, support for Unicode properties should be integrated here as
3358   well. It means re-organizing the above code so as to get hold of the property
3359   values before switching on the op-code. However, I wonder how many patterns
3360   combine ASCII \d etc with Unicode properties? (Note that if PCRE_UCP is set,
3361   these op-codes are never generated.) */
3362 
3363   case OP_DIGIT:
3364   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
3365          next == -ESC_h || next == -ESC_v || next == -ESC_R;
3366 
3367   case OP_NOT_DIGIT:
3368   return next == -ESC_d;
3369 
3370   case OP_WHITESPACE:
3371   return next == -ESC_S || next == -ESC_d || next == -ESC_w;
3372 
3373   case OP_NOT_WHITESPACE:
3374   return next == -ESC_s || next == -ESC_h || next == -ESC_v || next == -ESC_R;
3375 
3376   case OP_HSPACE:
3377   return next == -ESC_S || next == -ESC_H || next == -ESC_d ||
3378          next == -ESC_w || next == -ESC_v || next == -ESC_R;
3379 
3380   case OP_NOT_HSPACE:
3381   return next == -ESC_h;
3382 
3383   /* Can't have \S in here because VT matches \S (Perl anomaly) */
3384   case OP_ANYNL:
3385   case OP_VSPACE:
3386   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
3387 
3388   case OP_NOT_VSPACE:
3389   return next == -ESC_v || next == -ESC_R;
3390 
3391   case OP_WORDCHAR:
3392   return next == -ESC_W || next == -ESC_s || next == -ESC_h ||
3393          next == -ESC_v || next == -ESC_R;
3394 
3395   case OP_NOT_WORDCHAR:
3396   return next == -ESC_w || next == -ESC_d;
3397 
3398   default:
3399   return FALSE;
3400   }
3401 
3402 /* Control does not reach here */
3403 }
3404 
3405 
3406 
3407 /*************************************************
3408 *           Compile one branch                   *
3409 *************************************************/
3410 
3411 /* Scan the pattern, compiling it into a vector. If the options are
3412 changed during the branch, the pointer is used to change the external options
3413 bits. This function is used during the pre-compile phase when we are trying
3414 to find out the amount of memory needed, as well as during the real compile
3415 phase. The value of lengthptr distinguishes the two phases.
3416 
3417 Arguments:
3418   optionsptr     pointer to the option bits
3419   codeptr        points to the pointer to the current code point
3420   ptrptr         points to the current pattern pointer
3421   errorcodeptr   points to error code variable
3422   firstcharptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
3423   reqcharptr     set to the last literal character required, else < 0
3424   bcptr          points to current branch chain
3425   cond_depth     conditional nesting depth
3426   cd             contains pointers to tables etc.
3427   lengthptr      NULL during the real compile phase
3428                  points to length accumulator during pre-compile phase
3429 
3430 Returns:         TRUE on success
3431                  FALSE, with *errorcodeptr set non-zero on error
3432 */
3433 
3434 static BOOL
compile_branch(int * optionsptr,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,pcre_int32 * firstcharptr,pcre_int32 * reqcharptr,branch_chain * bcptr,int cond_depth,compile_data * cd,int * lengthptr)3435 compile_branch(int *optionsptr, pcre_uchar **codeptr,
3436   const pcre_uchar **ptrptr, int *errorcodeptr, pcre_int32 *firstcharptr,
3437   pcre_int32 *reqcharptr, branch_chain *bcptr, int cond_depth,
3438   compile_data *cd, int *lengthptr)
3439 {
3440 int repeat_type, op_type;
3441 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
3442 int bravalue = 0;
3443 int greedy_default, greedy_non_default;
3444 pcre_int32 firstchar, reqchar;
3445 pcre_int32 zeroreqchar, zerofirstchar;
3446 pcre_int32 req_caseopt, reqvary, tempreqvary;
3447 int options = *optionsptr;               /* May change dynamically */
3448 int after_manual_callout = 0;
3449 int length_prevgroup = 0;
3450 int c;
3451 pcre_uchar *code = *codeptr;
3452 pcre_uchar *last_code = code;
3453 pcre_uchar *orig_code = code;
3454 pcre_uchar *tempcode;
3455 BOOL inescq = FALSE;
3456 BOOL groupsetfirstchar = FALSE;
3457 const pcre_uchar *ptr = *ptrptr;
3458 const pcre_uchar *tempptr;
3459 const pcre_uchar *nestptr = NULL;
3460 pcre_uchar *previous = NULL;
3461 pcre_uchar *previous_callout = NULL;
3462 pcre_uchar *save_hwm = NULL;
3463 pcre_uint8 classbits[32];
3464 
3465 /* We can fish out the UTF-8 setting once and for all into a BOOL, but we
3466 must not do this for other options (e.g. PCRE_EXTENDED) because they may change
3467 dynamically as we process the pattern. */
3468 
3469 #ifdef SUPPORT_UTF
3470 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
3471 BOOL utf = (options & PCRE_UTF8) != 0;
3472 pcre_uchar utf_chars[6];
3473 #else
3474 BOOL utf = FALSE;
3475 #endif
3476 
3477 /* Helper variables for OP_XCLASS opcode (for characters > 255). */
3478 
3479 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3480 BOOL xclass;
3481 pcre_uchar *class_uchardata;
3482 pcre_uchar *class_uchardata_base;
3483 #endif
3484 
3485 #ifdef PCRE_DEBUG
3486 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
3487 #endif
3488 
3489 /* Set up the default and non-default settings for greediness */
3490 
3491 greedy_default = ((options & PCRE_UNGREEDY) != 0);
3492 greedy_non_default = greedy_default ^ 1;
3493 
3494 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
3495 matching encountered yet". It gets changed to REQ_NONE if we hit something that
3496 matches a non-fixed char first char; reqchar just remains unset if we never
3497 find one.
3498 
3499 When we hit a repeat whose minimum is zero, we may have to adjust these values
3500 to take the zero repeat into account. This is implemented by setting them to
3501 zerofirstbyte and zeroreqchar when such a repeat is encountered. The individual
3502 item types that can be repeated set these backoff variables appropriately. */
3503 
3504 firstchar = reqchar = zerofirstchar = zeroreqchar = REQ_UNSET;
3505 
3506 /* The variable req_caseopt contains either the REQ_CASELESS value
3507 or zero, according to the current setting of the caseless flag. The
3508 REQ_CASELESS leaves the lower 28 bit empty. It is added into the
3509 firstchar or reqchar variables to record the case status of the
3510 value. This is used only for ASCII characters. */
3511 
3512 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
3513 
3514 /* Switch on next character until the end of the branch */
3515 
3516 for (;; ptr++)
3517   {
3518   BOOL negate_class;
3519   BOOL should_flip_negation;
3520   BOOL possessive_quantifier;
3521   BOOL is_quantifier;
3522   BOOL is_recurse;
3523   BOOL reset_bracount;
3524   int class_has_8bitchar;
3525   int class_single_char;
3526   int newoptions;
3527   int recno;
3528   int refsign;
3529   int skipbytes;
3530   int subreqchar;
3531   int subfirstchar;
3532   int terminator;
3533   int mclength;
3534   int tempbracount;
3535   pcre_uchar mcbuffer[8];
3536 
3537   /* Get next character in the pattern */
3538 
3539   c = *ptr;
3540 
3541   /* If we are at the end of a nested substitution, revert to the outer level
3542   string. Nesting only happens one level deep. */
3543 
3544   if (c == 0 && nestptr != NULL)
3545     {
3546     ptr = nestptr;
3547     nestptr = NULL;
3548     c = *ptr;
3549     }
3550 
3551   /* If we are in the pre-compile phase, accumulate the length used for the
3552   previous cycle of this loop. */
3553 
3554   if (lengthptr != NULL)
3555     {
3556 #ifdef PCRE_DEBUG
3557     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
3558 #endif
3559     if (code > cd->start_workspace + cd->workspace_size -
3560         WORK_SIZE_SAFETY_MARGIN)                       /* Check for overrun */
3561       {
3562       *errorcodeptr = ERR52;
3563       goto FAILED;
3564       }
3565 
3566     /* There is at least one situation where code goes backwards: this is the
3567     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
3568     the class is simply eliminated. However, it is created first, so we have to
3569     allow memory for it. Therefore, don't ever reduce the length at this point.
3570     */
3571 
3572     if (code < last_code) code = last_code;
3573 
3574     /* Paranoid check for integer overflow */
3575 
3576     if (OFLOW_MAX - *lengthptr < code - last_code)
3577       {
3578       *errorcodeptr = ERR20;
3579       goto FAILED;
3580       }
3581 
3582     *lengthptr += (int)(code - last_code);
3583     DPRINTF(("length=%d added %d c=%c (0x%x)\n", *lengthptr,
3584       (int)(code - last_code), c, c));
3585 
3586     /* If "previous" is set and it is not at the start of the work space, move
3587     it back to there, in order to avoid filling up the work space. Otherwise,
3588     if "previous" is NULL, reset the current code pointer to the start. */
3589 
3590     if (previous != NULL)
3591       {
3592       if (previous > orig_code)
3593         {
3594         memmove(orig_code, previous, IN_UCHARS(code - previous));
3595         code -= previous - orig_code;
3596         previous = orig_code;
3597         }
3598       }
3599     else code = orig_code;
3600 
3601     /* Remember where this code item starts so we can pick up the length
3602     next time round. */
3603 
3604     last_code = code;
3605     }
3606 
3607   /* In the real compile phase, just check the workspace used by the forward
3608   reference list. */
3609 
3610   else if (cd->hwm > cd->start_workspace + cd->workspace_size -
3611            WORK_SIZE_SAFETY_MARGIN)
3612     {
3613     *errorcodeptr = ERR52;
3614     goto FAILED;
3615     }
3616 
3617   /* If in \Q...\E, check for the end; if not, we have a literal */
3618 
3619   if (inescq && c != 0)
3620     {
3621     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3622       {
3623       inescq = FALSE;
3624       ptr++;
3625       continue;
3626       }
3627     else
3628       {
3629       if (previous_callout != NULL)
3630         {
3631         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
3632           complete_callout(previous_callout, ptr, cd);
3633         previous_callout = NULL;
3634         }
3635       if ((options & PCRE_AUTO_CALLOUT) != 0)
3636         {
3637         previous_callout = code;
3638         code = auto_callout(code, ptr, cd);
3639         }
3640       goto NORMAL_CHAR;
3641       }
3642     }
3643 
3644   /* Fill in length of a previous callout, except when the next thing is
3645   a quantifier. */
3646 
3647   is_quantifier =
3648     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
3649     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
3650 
3651   if (!is_quantifier && previous_callout != NULL &&
3652        after_manual_callout-- <= 0)
3653     {
3654     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
3655       complete_callout(previous_callout, ptr, cd);
3656     previous_callout = NULL;
3657     }
3658 
3659   /* In extended mode, skip white space and comments. */
3660 
3661   if ((options & PCRE_EXTENDED) != 0)
3662     {
3663     if (MAX_255(*ptr) && (cd->ctypes[c] & ctype_space) != 0) continue;
3664     if (c == CHAR_NUMBER_SIGN)
3665       {
3666       ptr++;
3667       while (*ptr != 0)
3668         {
3669         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
3670         ptr++;
3671 #ifdef SUPPORT_UTF
3672         if (utf) FORWARDCHAR(ptr);
3673 #endif
3674         }
3675       if (*ptr != 0) continue;
3676 
3677       /* Else fall through to handle end of string */
3678       c = 0;
3679       }
3680     }
3681 
3682   /* No auto callout for quantifiers. */
3683 
3684   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
3685     {
3686     previous_callout = code;
3687     code = auto_callout(code, ptr, cd);
3688     }
3689 
3690   switch(c)
3691     {
3692     /* ===================================================================*/
3693     case 0:                        /* The branch terminates at string end */
3694     case CHAR_VERTICAL_LINE:       /* or | or ) */
3695     case CHAR_RIGHT_PARENTHESIS:
3696     *firstcharptr = firstchar;
3697     *reqcharptr = reqchar;
3698     *codeptr = code;
3699     *ptrptr = ptr;
3700     if (lengthptr != NULL)
3701       {
3702       if (OFLOW_MAX - *lengthptr < code - last_code)
3703         {
3704         *errorcodeptr = ERR20;
3705         goto FAILED;
3706         }
3707       *lengthptr += (int)(code - last_code);   /* To include callout length */
3708       DPRINTF((">> end branch\n"));
3709       }
3710     return TRUE;
3711 
3712 
3713     /* ===================================================================*/
3714     /* Handle single-character metacharacters. In multiline mode, ^ disables
3715     the setting of any following char as a first character. */
3716 
3717     case CHAR_CIRCUMFLEX_ACCENT:
3718     previous = NULL;
3719     if ((options & PCRE_MULTILINE) != 0)
3720       {
3721       if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3722       *code++ = OP_CIRCM;
3723       }
3724     else *code++ = OP_CIRC;
3725     break;
3726 
3727     case CHAR_DOLLAR_SIGN:
3728     previous = NULL;
3729     *code++ = ((options & PCRE_MULTILINE) != 0)? OP_DOLLM : OP_DOLL;
3730     break;
3731 
3732     /* There can never be a first char if '.' is first, whatever happens about
3733     repeats. The value of reqchar doesn't change either. */
3734 
3735     case CHAR_DOT:
3736     if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3737     zerofirstchar = firstchar;
3738     zeroreqchar = reqchar;
3739     previous = code;
3740     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
3741     break;
3742 
3743 
3744     /* ===================================================================*/
3745     /* Character classes. If the included characters are all < 256, we build a
3746     32-byte bitmap of the permitted characters, except in the special case
3747     where there is only one such character. For negated classes, we build the
3748     map as usual, then invert it at the end. However, we use a different opcode
3749     so that data characters > 255 can be handled correctly.
3750 
3751     If the class contains characters outside the 0-255 range, a different
3752     opcode is compiled. It may optionally have a bit map for characters < 256,
3753     but those above are explicitly listed afterwards. A flag byte tells
3754     whether the bitmap is present, and whether this is a negated class or not.
3755 
3756     In JavaScript compatibility mode, an isolated ']' causes an error. In
3757     default (Perl) mode, it is treated as a data character. */
3758 
3759     case CHAR_RIGHT_SQUARE_BRACKET:
3760     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3761       {
3762       *errorcodeptr = ERR64;
3763       goto FAILED;
3764       }
3765     goto NORMAL_CHAR;
3766 
3767     case CHAR_LEFT_SQUARE_BRACKET:
3768     previous = code;
3769 
3770     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
3771     they are encountered at the top level, so we'll do that too. */
3772 
3773     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3774          ptr[1] == CHAR_EQUALS_SIGN) &&
3775         check_posix_syntax(ptr, &tempptr))
3776       {
3777       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
3778       goto FAILED;
3779       }
3780 
3781     /* If the first character is '^', set the negation flag and skip it. Also,
3782     if the first few characters (either before or after ^) are \Q\E or \E we
3783     skip them too. This makes for compatibility with Perl. */
3784 
3785     negate_class = FALSE;
3786     for (;;)
3787       {
3788       c = *(++ptr);
3789       if (c == CHAR_BACKSLASH)
3790         {
3791         if (ptr[1] == CHAR_E)
3792           ptr++;
3793         else if (STRNCMP_UC_C8(ptr + 1, STR_Q STR_BACKSLASH STR_E, 3) == 0)
3794           ptr += 3;
3795         else
3796           break;
3797         }
3798       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
3799         negate_class = TRUE;
3800       else break;
3801       }
3802 
3803     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
3804     an initial ']' is taken as a data character -- the code below handles
3805     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
3806     [^] must match any character, so generate OP_ALLANY. */
3807 
3808     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
3809         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
3810       {
3811       *code++ = negate_class? OP_ALLANY : OP_FAIL;
3812       if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
3813       zerofirstchar = firstchar;
3814       break;
3815       }
3816 
3817     /* If a class contains a negative special such as \S, we need to flip the
3818     negation flag at the end, so that support for characters > 255 works
3819     correctly (they are all included in the class). */
3820 
3821     should_flip_negation = FALSE;
3822 
3823     /* For optimization purposes, we track some properties of the class.
3824     class_has_8bitchar will be non-zero, if the class contains at least one
3825     < 256 character. class_single_char will be 1 if the class contains only
3826     a single character. */
3827 
3828     class_has_8bitchar = 0;
3829     class_single_char = 0;
3830 
3831     /* Initialize the 32-char bit map to all zeros. We build the map in a
3832     temporary bit of memory, in case the class contains only 1 character (less
3833     than 256), because in that case the compiled code doesn't use the bit map.
3834     */
3835 
3836     memset(classbits, 0, 32 * sizeof(pcre_uint8));
3837 
3838 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3839     xclass = FALSE;                           /* No chars >= 256 */
3840     class_uchardata = code + LINK_SIZE + 2;   /* For UTF-8 items */
3841     class_uchardata_base = class_uchardata;   /* For resetting in pass 1 */
3842 #endif
3843 
3844     /* Process characters until ] is reached. By writing this as a "do" it
3845     means that an initial ] is taken as a data character. At the start of the
3846     loop, c contains the first byte of the character. */
3847 
3848     if (c != 0) do
3849       {
3850       const pcre_uchar *oldptr;
3851 
3852 #ifdef SUPPORT_UTF
3853       if (utf && HAS_EXTRALEN(c))
3854         {                           /* Braces are required because the */
3855         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
3856         }
3857 #endif
3858 
3859 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
3860       /* In the pre-compile phase, accumulate the length of any extra
3861       data and reset the pointer. This is so that very large classes that
3862       contain a zillion > 255 characters no longer overwrite the work space
3863       (which is on the stack). */
3864 
3865       if (lengthptr != NULL)
3866         {
3867         *lengthptr += class_uchardata - class_uchardata_base;
3868         class_uchardata = class_uchardata_base;
3869         }
3870 #endif
3871 
3872       /* Inside \Q...\E everything is literal except \E */
3873 
3874       if (inescq)
3875         {
3876         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
3877           {
3878           inescq = FALSE;                   /* Reset literal state */
3879           ptr++;                            /* Skip the 'E' */
3880           continue;                         /* Carry on with next */
3881           }
3882         goto CHECK_RANGE;                   /* Could be range if \E follows */
3883         }
3884 
3885       /* Handle POSIX class names. Perl allows a negation extension of the
3886       form [:^name:]. A square bracket that doesn't match the syntax is
3887       treated as a literal. We also recognize the POSIX constructions
3888       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
3889       5.6 and 5.8 do. */
3890 
3891       if (c == CHAR_LEFT_SQUARE_BRACKET &&
3892           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3893            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3894         {
3895         BOOL local_negate = FALSE;
3896         int posix_class, taboffset, tabopt;
3897         const pcre_uint8 *cbits = cd->cbits;
3898         pcre_uint8 pbits[32];
3899 
3900         if (ptr[1] != CHAR_COLON)
3901           {
3902           *errorcodeptr = ERR31;
3903           goto FAILED;
3904           }
3905 
3906         ptr += 2;
3907         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3908           {
3909           local_negate = TRUE;
3910           should_flip_negation = TRUE;  /* Note negative special */
3911           ptr++;
3912           }
3913 
3914         posix_class = check_posix_name(ptr, (int)(tempptr - ptr));
3915         if (posix_class < 0)
3916           {
3917           *errorcodeptr = ERR30;
3918           goto FAILED;
3919           }
3920 
3921         /* If matching is caseless, upper and lower are converted to
3922         alpha. This relies on the fact that the class table starts with
3923         alpha, lower, upper as the first 3 entries. */
3924 
3925         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3926           posix_class = 0;
3927 
3928         /* When PCRE_UCP is set, some of the POSIX classes are converted to
3929         different escape sequences that use Unicode properties. */
3930 
3931 #ifdef SUPPORT_UCP
3932         if ((options & PCRE_UCP) != 0)
3933           {
3934           int pc = posix_class + ((local_negate)? POSIX_SUBSIZE/2 : 0);
3935           if (posix_substitutes[pc] != NULL)
3936             {
3937             nestptr = tempptr + 1;
3938             ptr = posix_substitutes[pc] - 1;
3939             continue;
3940             }
3941           }
3942 #endif
3943         /* In the non-UCP case, we build the bit map for the POSIX class in a
3944         chunk of local store because we may be adding and subtracting from it,
3945         and we don't want to subtract bits that may be in the main map already.
3946         At the end we or the result into the bit map that is being built. */
3947 
3948         posix_class *= 3;
3949 
3950         /* Copy in the first table (always present) */
3951 
3952         memcpy(pbits, cbits + posix_class_maps[posix_class],
3953           32 * sizeof(pcre_uint8));
3954 
3955         /* If there is a second table, add or remove it as required. */
3956 
3957         taboffset = posix_class_maps[posix_class + 1];
3958         tabopt = posix_class_maps[posix_class + 2];
3959 
3960         if (taboffset >= 0)
3961           {
3962           if (tabopt >= 0)
3963             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3964           else
3965             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3966           }
3967 
3968         /* Now see if we need to remove any special characters. An option
3969         value of 1 removes vertical space and 2 removes underscore. */
3970 
3971         if (tabopt < 0) tabopt = -tabopt;
3972         if (tabopt == 1) pbits[1] &= ~0x3c;
3973           else if (tabopt == 2) pbits[11] &= 0x7f;
3974 
3975         /* Add the POSIX table or its complement into the main table that is
3976         being built and we are done. */
3977 
3978         if (local_negate)
3979           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3980         else
3981           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3982 
3983         ptr = tempptr + 1;
3984         /* Every class contains at least one < 256 character. */
3985         class_has_8bitchar = 1;
3986         /* Every class contains at least two characters. */
3987         class_single_char = 2;
3988         continue;    /* End of POSIX syntax handling */
3989         }
3990 
3991       /* Backslash may introduce a single character, or it may introduce one
3992       of the specials, which just set a flag. The sequence \b is a special
3993       case. Inside a class (and only there) it is treated as backspace. We
3994       assume that other escapes have more than one character in them, so
3995       speculatively set both class_has_8bitchar and class_single_char bigger
3996       than one. Unrecognized escapes fall through and are either treated
3997       as literal characters (by default), or are faulted if
3998       PCRE_EXTRA is set. */
3999 
4000       if (c == CHAR_BACKSLASH)
4001         {
4002         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
4003         if (*errorcodeptr != 0) goto FAILED;
4004 
4005         if (-c == ESC_b) c = CHAR_BS;    /* \b is backspace in a class */
4006         else if (-c == ESC_N)            /* \N is not supported in a class */
4007           {
4008           *errorcodeptr = ERR71;
4009           goto FAILED;
4010           }
4011         else if (-c == ESC_Q)            /* Handle start of quoted string */
4012           {
4013           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4014             {
4015             ptr += 2; /* avoid empty string */
4016             }
4017           else inescq = TRUE;
4018           continue;
4019           }
4020         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
4021 
4022         if (c < 0)
4023           {
4024           const pcre_uint8 *cbits = cd->cbits;
4025           /* Every class contains at least two < 256 characters. */
4026           class_has_8bitchar++;
4027           /* Every class contains at least two characters. */
4028           class_single_char += 2;
4029 
4030           switch (-c)
4031             {
4032 #ifdef SUPPORT_UCP
4033             case ESC_du:     /* These are the values given for \d etc */
4034             case ESC_DU:     /* when PCRE_UCP is set. We replace the */
4035             case ESC_wu:     /* escape sequence with an appropriate \p */
4036             case ESC_WU:     /* or \P to test Unicode properties instead */
4037             case ESC_su:     /* of the default ASCII testing. */
4038             case ESC_SU:
4039             nestptr = ptr;
4040             ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
4041             class_has_8bitchar--;                /* Undo! */
4042             continue;
4043 #endif
4044             case ESC_d:
4045             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
4046             continue;
4047 
4048             case ESC_D:
4049             should_flip_negation = TRUE;
4050             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
4051             continue;
4052 
4053             case ESC_w:
4054             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
4055             continue;
4056 
4057             case ESC_W:
4058             should_flip_negation = TRUE;
4059             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
4060             continue;
4061 
4062             /* Perl 5.004 onwards omits VT from \s, but we must preserve it
4063             if it was previously set by something earlier in the character
4064             class. */
4065 
4066             case ESC_s:
4067             classbits[0] |= cbits[cbit_space];
4068             classbits[1] |= cbits[cbit_space+1] & ~0x08;
4069             for (c = 2; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
4070             continue;
4071 
4072             case ESC_S:
4073             should_flip_negation = TRUE;
4074             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
4075             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
4076             continue;
4077 
4078             case ESC_h:
4079             SETBIT(classbits, 0x09); /* HT */
4080             SETBIT(classbits, 0x20); /* SPACE */
4081             SETBIT(classbits, 0xa0); /* NBSP */
4082 #ifndef COMPILE_PCRE8
4083             xclass = TRUE;
4084             *class_uchardata++ = XCL_SINGLE;
4085             *class_uchardata++ = 0x1680;
4086             *class_uchardata++ = XCL_SINGLE;
4087             *class_uchardata++ = 0x180e;
4088             *class_uchardata++ = XCL_RANGE;
4089             *class_uchardata++ = 0x2000;
4090             *class_uchardata++ = 0x200a;
4091             *class_uchardata++ = XCL_SINGLE;
4092             *class_uchardata++ = 0x202f;
4093             *class_uchardata++ = XCL_SINGLE;
4094             *class_uchardata++ = 0x205f;
4095             *class_uchardata++ = XCL_SINGLE;
4096             *class_uchardata++ = 0x3000;
4097 #elif defined SUPPORT_UTF
4098             if (utf)
4099               {
4100               xclass = TRUE;
4101               *class_uchardata++ = XCL_SINGLE;
4102               class_uchardata += PRIV(ord2utf)(0x1680, class_uchardata);
4103               *class_uchardata++ = XCL_SINGLE;
4104               class_uchardata += PRIV(ord2utf)(0x180e, class_uchardata);
4105               *class_uchardata++ = XCL_RANGE;
4106               class_uchardata += PRIV(ord2utf)(0x2000, class_uchardata);
4107               class_uchardata += PRIV(ord2utf)(0x200a, class_uchardata);
4108               *class_uchardata++ = XCL_SINGLE;
4109               class_uchardata += PRIV(ord2utf)(0x202f, class_uchardata);
4110               *class_uchardata++ = XCL_SINGLE;
4111               class_uchardata += PRIV(ord2utf)(0x205f, class_uchardata);
4112               *class_uchardata++ = XCL_SINGLE;
4113               class_uchardata += PRIV(ord2utf)(0x3000, class_uchardata);
4114               }
4115 #endif
4116             continue;
4117 
4118             case ESC_H:
4119             for (c = 0; c < 32; c++)
4120               {
4121               int x = 0xff;
4122               switch (c)
4123                 {
4124                 case 0x09/8: x ^= 1 << (0x09%8); break;
4125                 case 0x20/8: x ^= 1 << (0x20%8); break;
4126                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
4127                 default: break;
4128                 }
4129               classbits[c] |= x;
4130               }
4131 #ifndef COMPILE_PCRE8
4132             xclass = TRUE;
4133             *class_uchardata++ = XCL_RANGE;
4134             *class_uchardata++ = 0x0100;
4135             *class_uchardata++ = 0x167f;
4136             *class_uchardata++ = XCL_RANGE;
4137             *class_uchardata++ = 0x1681;
4138             *class_uchardata++ = 0x180d;
4139             *class_uchardata++ = XCL_RANGE;
4140             *class_uchardata++ = 0x180f;
4141             *class_uchardata++ = 0x1fff;
4142             *class_uchardata++ = XCL_RANGE;
4143             *class_uchardata++ = 0x200b;
4144             *class_uchardata++ = 0x202e;
4145             *class_uchardata++ = XCL_RANGE;
4146             *class_uchardata++ = 0x2030;
4147             *class_uchardata++ = 0x205e;
4148             *class_uchardata++ = XCL_RANGE;
4149             *class_uchardata++ = 0x2060;
4150             *class_uchardata++ = 0x2fff;
4151             *class_uchardata++ = XCL_RANGE;
4152             *class_uchardata++ = 0x3001;
4153 #ifdef SUPPORT_UTF
4154             if (utf)
4155               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4156             else
4157 #endif
4158               *class_uchardata++ = 0xffff;
4159 #elif defined SUPPORT_UTF
4160             if (utf)
4161               {
4162               xclass = TRUE;
4163               *class_uchardata++ = XCL_RANGE;
4164               class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4165               class_uchardata += PRIV(ord2utf)(0x167f, class_uchardata);
4166               *class_uchardata++ = XCL_RANGE;
4167               class_uchardata += PRIV(ord2utf)(0x1681, class_uchardata);
4168               class_uchardata += PRIV(ord2utf)(0x180d, class_uchardata);
4169               *class_uchardata++ = XCL_RANGE;
4170               class_uchardata += PRIV(ord2utf)(0x180f, class_uchardata);
4171               class_uchardata += PRIV(ord2utf)(0x1fff, class_uchardata);
4172               *class_uchardata++ = XCL_RANGE;
4173               class_uchardata += PRIV(ord2utf)(0x200b, class_uchardata);
4174               class_uchardata += PRIV(ord2utf)(0x202e, class_uchardata);
4175               *class_uchardata++ = XCL_RANGE;
4176               class_uchardata += PRIV(ord2utf)(0x2030, class_uchardata);
4177               class_uchardata += PRIV(ord2utf)(0x205e, class_uchardata);
4178               *class_uchardata++ = XCL_RANGE;
4179               class_uchardata += PRIV(ord2utf)(0x2060, class_uchardata);
4180               class_uchardata += PRIV(ord2utf)(0x2fff, class_uchardata);
4181               *class_uchardata++ = XCL_RANGE;
4182               class_uchardata += PRIV(ord2utf)(0x3001, class_uchardata);
4183               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4184               }
4185 #endif
4186             continue;
4187 
4188             case ESC_v:
4189             SETBIT(classbits, 0x0a); /* LF */
4190             SETBIT(classbits, 0x0b); /* VT */
4191             SETBIT(classbits, 0x0c); /* FF */
4192             SETBIT(classbits, 0x0d); /* CR */
4193             SETBIT(classbits, 0x85); /* NEL */
4194 #ifndef COMPILE_PCRE8
4195             xclass = TRUE;
4196             *class_uchardata++ = XCL_RANGE;
4197             *class_uchardata++ = 0x2028;
4198             *class_uchardata++ = 0x2029;
4199 #elif defined SUPPORT_UTF
4200             if (utf)
4201               {
4202               xclass = TRUE;
4203               *class_uchardata++ = XCL_RANGE;
4204               class_uchardata += PRIV(ord2utf)(0x2028, class_uchardata);
4205               class_uchardata += PRIV(ord2utf)(0x2029, class_uchardata);
4206               }
4207 #endif
4208             continue;
4209 
4210             case ESC_V:
4211             for (c = 0; c < 32; c++)
4212               {
4213               int x = 0xff;
4214               switch (c)
4215                 {
4216                 case 0x0a/8: x ^= 1 << (0x0a%8);
4217                              x ^= 1 << (0x0b%8);
4218                              x ^= 1 << (0x0c%8);
4219                              x ^= 1 << (0x0d%8);
4220                              break;
4221                 case 0x85/8: x ^= 1 << (0x85%8); break;
4222                 default: break;
4223                 }
4224               classbits[c] |= x;
4225               }
4226 
4227 #ifndef COMPILE_PCRE8
4228             xclass = TRUE;
4229             *class_uchardata++ = XCL_RANGE;
4230             *class_uchardata++ = 0x0100;
4231             *class_uchardata++ = 0x2027;
4232             *class_uchardata++ = XCL_RANGE;
4233             *class_uchardata++ = 0x202a;
4234 #ifdef SUPPORT_UTF
4235             if (utf)
4236               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4237             else
4238 #endif
4239               *class_uchardata++ = 0xffff;
4240 #elif defined SUPPORT_UTF
4241             if (utf)
4242               {
4243               xclass = TRUE;
4244               *class_uchardata++ = XCL_RANGE;
4245               class_uchardata += PRIV(ord2utf)(0x0100, class_uchardata);
4246               class_uchardata += PRIV(ord2utf)(0x2027, class_uchardata);
4247               *class_uchardata++ = XCL_RANGE;
4248               class_uchardata += PRIV(ord2utf)(0x202a, class_uchardata);
4249               class_uchardata += PRIV(ord2utf)(0x10ffff, class_uchardata);
4250               }
4251 #endif
4252             continue;
4253 
4254 #ifdef SUPPORT_UCP
4255             case ESC_p:
4256             case ESC_P:
4257               {
4258               BOOL negated;
4259               int pdata;
4260               int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
4261               if (ptype < 0) goto FAILED;
4262               xclass = TRUE;
4263               *class_uchardata++ = ((-c == ESC_p) != negated)?
4264                 XCL_PROP : XCL_NOTPROP;
4265               *class_uchardata++ = ptype;
4266               *class_uchardata++ = pdata;
4267               class_has_8bitchar--;                /* Undo! */
4268               continue;
4269               }
4270 #endif
4271             /* Unrecognized escapes are faulted if PCRE is running in its
4272             strict mode. By default, for compatibility with Perl, they are
4273             treated as literals. */
4274 
4275             default:
4276             if ((options & PCRE_EXTRA) != 0)
4277               {
4278               *errorcodeptr = ERR7;
4279               goto FAILED;
4280               }
4281             class_has_8bitchar--;    /* Undo the speculative increase. */
4282             class_single_char -= 2;  /* Undo the speculative increase. */
4283             c = *ptr;                /* Get the final character and fall through */
4284             break;
4285             }
4286           }
4287 
4288         /* Fall through if we have a single character (c >= 0). This may be
4289         greater than 256. */
4290 
4291         }   /* End of backslash handling */
4292 
4293       /* A single character may be followed by '-' to form a range. However,
4294       Perl does not permit ']' to be the end of the range. A '-' character
4295       at the end is treated as a literal. Perl ignores orphaned \E sequences
4296       entirely. The code for handling \Q and \E is messy. */
4297 
4298       CHECK_RANGE:
4299       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
4300         {
4301         inescq = FALSE;
4302         ptr += 2;
4303         }
4304 
4305       oldptr = ptr;
4306 
4307       /* Remember \r or \n */
4308 
4309       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4310 
4311       /* Check for range */
4312 
4313       if (!inescq && ptr[1] == CHAR_MINUS)
4314         {
4315         int d;
4316         ptr += 2;
4317         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
4318 
4319         /* If we hit \Q (not followed by \E) at this point, go into escaped
4320         mode. */
4321 
4322         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
4323           {
4324           ptr += 2;
4325           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
4326             { ptr += 2; continue; }
4327           inescq = TRUE;
4328           break;
4329           }
4330 
4331         if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
4332           {
4333           ptr = oldptr;
4334           goto LONE_SINGLE_CHARACTER;
4335           }
4336 
4337 #ifdef SUPPORT_UTF
4338         if (utf)
4339           {                           /* Braces are required because the */
4340           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
4341           }
4342         else
4343 #endif
4344         d = *ptr;  /* Not UTF-8 mode */
4345 
4346         /* The second part of a range can be a single-character escape, but
4347         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
4348         in such circumstances. */
4349 
4350         if (!inescq && d == CHAR_BACKSLASH)
4351           {
4352           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
4353           if (*errorcodeptr != 0) goto FAILED;
4354 
4355           /* \b is backspace; any other special means the '-' was literal */
4356 
4357           if (d < 0)
4358             {
4359             if (d == -ESC_b) d = CHAR_BS; else
4360               {
4361               ptr = oldptr;
4362               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
4363               }
4364             }
4365           }
4366 
4367         /* Check that the two values are in the correct order. Optimize
4368         one-character ranges */
4369 
4370         if (d < c)
4371           {
4372           *errorcodeptr = ERR8;
4373           goto FAILED;
4374           }
4375 
4376         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
4377 
4378         /* Remember \r or \n */
4379 
4380         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
4381 
4382         /* Since we found a character range, single character optimizations
4383         cannot be done anymore. */
4384         class_single_char = 2;
4385 
4386         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
4387         matching, we have to use an XCLASS with extra data items. Caseless
4388         matching for characters > 127 is available only if UCP support is
4389         available. */
4390 
4391 #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4392         if ((d > 255) || (utf && ((options & PCRE_CASELESS) != 0 && d > 127)))
4393 #elif defined  SUPPORT_UTF
4394         if (utf && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
4395 #elif !(defined COMPILE_PCRE8)
4396         if (d > 255)
4397 #endif
4398 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4399           {
4400           xclass = TRUE;
4401 
4402           /* With UCP support, we can find the other case equivalents of
4403           the relevant characters. There may be several ranges. Optimize how
4404           they fit with the basic range. */
4405 
4406 #ifdef SUPPORT_UCP
4407 #ifndef COMPILE_PCRE8
4408           if (utf && (options & PCRE_CASELESS) != 0)
4409 #else
4410           if ((options & PCRE_CASELESS) != 0)
4411 #endif
4412             {
4413             unsigned int occ, ocd;
4414             unsigned int cc = c;
4415             unsigned int origd = d;
4416             while (get_othercase_range(&cc, origd, &occ, &ocd))
4417               {
4418               if (occ >= (unsigned int)c &&
4419                   ocd <= (unsigned int)d)
4420                 continue;                          /* Skip embedded ranges */
4421 
4422               if (occ < (unsigned int)c  &&
4423                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
4424                 {                                  /* if there is overlap,   */
4425                 c = occ;                           /* noting that if occ < c */
4426                 continue;                          /* we can't have ocd > d  */
4427                 }                                  /* because a subrange is  */
4428               if (ocd > (unsigned int)d &&
4429                   occ <= (unsigned int)d + 1)      /* always shorter than    */
4430                 {                                  /* the basic range.       */
4431                 d = ocd;
4432                 continue;
4433                 }
4434 
4435               if (occ == ocd)
4436                 {
4437                 *class_uchardata++ = XCL_SINGLE;
4438                 }
4439               else
4440                 {
4441                 *class_uchardata++ = XCL_RANGE;
4442                 class_uchardata += PRIV(ord2utf)(occ, class_uchardata);
4443                 }
4444               class_uchardata += PRIV(ord2utf)(ocd, class_uchardata);
4445               }
4446             }
4447 #endif  /* SUPPORT_UCP */
4448 
4449           /* Now record the original range, possibly modified for UCP caseless
4450           overlapping ranges. */
4451 
4452           *class_uchardata++ = XCL_RANGE;
4453 #ifdef SUPPORT_UTF
4454 #ifndef COMPILE_PCRE8
4455           if (utf)
4456             {
4457             class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4458             class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4459             }
4460           else
4461             {
4462             *class_uchardata++ = c;
4463             *class_uchardata++ = d;
4464             }
4465 #else
4466           class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4467           class_uchardata += PRIV(ord2utf)(d, class_uchardata);
4468 #endif
4469 #else /* SUPPORT_UTF */
4470           *class_uchardata++ = c;
4471           *class_uchardata++ = d;
4472 #endif /* SUPPORT_UTF */
4473 
4474           /* With UCP support, we are done. Without UCP support, there is no
4475           caseless matching for UTF characters > 127; we can use the bit map
4476           for the smaller ones. As for 16 bit characters without UTF, we
4477           can still use the bit map for the characters below 256. */
4478 
4479 #ifdef SUPPORT_UCP
4480 #ifndef COMPILE_PCRE8
4481           if (utf)
4482 #endif
4483             continue;    /* With next character in the class */
4484 #endif  /* SUPPORT_UCP */
4485 
4486 #if defined SUPPORT_UTF && !defined(SUPPORT_UCP) && !(defined COMPILE_PCRE8)
4487           if (utf)
4488             {
4489             if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4490             /* Adjust upper limit and fall through to set up the map */
4491             d = 127;
4492             }
4493           else
4494             {
4495             if (c > 255) continue;
4496             /* Adjust upper limit and fall through to set up the map */
4497             d = 255;
4498             }
4499 #elif defined SUPPORT_UTF && !defined(SUPPORT_UCP)
4500           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
4501           /* Adjust upper limit and fall through to set up the map */
4502           d = 127;
4503 #else
4504           if (c > 255) continue;
4505           /* Adjust upper limit and fall through to set up the map */
4506           d = 255;
4507 #endif  /* SUPPORT_UTF && !SUPPORT_UCP && !COMPILE_PCRE8 */
4508           }
4509 #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4510 
4511         /* We use the bit map for 8 bit mode, or when the characters fall
4512         partially or entirely to [0-255] ([0-127] for UCP) ranges. */
4513 
4514         class_has_8bitchar = 1;
4515 
4516         /* We can save a bit of time by skipping this in the pre-compile. */
4517 
4518         if (lengthptr == NULL) for (; c <= d; c++)
4519           {
4520           classbits[c/8] |= (1 << (c&7));
4521           if ((options & PCRE_CASELESS) != 0)
4522             {
4523             int uc = cd->fcc[c]; /* flip case */
4524             classbits[uc/8] |= (1 << (uc&7));
4525             }
4526           }
4527 
4528         continue;   /* Go get the next char in the class */
4529         }
4530 
4531       /* Handle a lone single character - we can get here for a normal
4532       non-escape char, or after \ that introduces a single character or for an
4533       apparent range that isn't. */
4534 
4535       LONE_SINGLE_CHARACTER:
4536 
4537       /* Only the value of 1 matters for class_single_char. */
4538 
4539       if (class_single_char < 2) class_single_char++;
4540 
4541       /* If class_single_char is 1, we saw precisely one character. As long as
4542       there was no use of \p or \P, in other words, no use of any XCLASS
4543       features, we can optimize.
4544 
4545       The optimization throws away the bit map. We turn the item into a
4546       1-character OP_CHAR[I] if it's positive, or OP_NOT[I] if it's negative.
4547       In the positive case, it can cause firstchar to be set. Otherwise, there
4548       can be no first char if this item is first, whatever repeat count may
4549       follow. In the case of reqchar, save the previous value for reinstating. */
4550 
4551       if (class_single_char == 1 && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
4552         {
4553         ptr++;
4554         zeroreqchar = reqchar;
4555 
4556         if (negate_class)
4557           {
4558           if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4559           zerofirstchar = firstchar;
4560           *code++ = ((options & PCRE_CASELESS) != 0)? OP_NOTI: OP_NOT;
4561 #ifdef SUPPORT_UTF
4562           if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4563             code += PRIV(ord2utf)(c, code);
4564           else
4565 #endif
4566             *code++ = c;
4567           goto NOT_CHAR;
4568           }
4569 
4570         /* For a single, positive character, get the value into mcbuffer, and
4571         then we can handle this with the normal one-character code. */
4572 
4573 #ifdef SUPPORT_UTF
4574         if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
4575           mclength = PRIV(ord2utf)(c, mcbuffer);
4576         else
4577 #endif
4578           {
4579           mcbuffer[0] = c;
4580           mclength = 1;
4581           }
4582         goto ONE_CHAR;
4583         }       /* End of 1-char optimization */
4584 
4585       /* Handle a character that cannot go in the bit map. */
4586 
4587 #if defined SUPPORT_UTF && !(defined COMPILE_PCRE8)
4588       if ((c > 255) || (utf && ((options & PCRE_CASELESS) != 0 && c > 127)))
4589 #elif defined SUPPORT_UTF
4590       if (utf && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
4591 #elif !(defined COMPILE_PCRE8)
4592       if (c > 255)
4593 #endif
4594 
4595 #if defined SUPPORT_UTF || !(defined COMPILE_PCRE8)
4596         {
4597         xclass = TRUE;
4598         *class_uchardata++ = XCL_SINGLE;
4599 #ifdef SUPPORT_UTF
4600 #ifndef COMPILE_PCRE8
4601         /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4602         if (!utf)
4603           *class_uchardata++ = c;
4604         else
4605 #endif
4606           class_uchardata += PRIV(ord2utf)(c, class_uchardata);
4607 #else /* SUPPORT_UTF */
4608         *class_uchardata++ = c;
4609 #endif /* SUPPORT_UTF */
4610 
4611 #ifdef SUPPORT_UCP
4612 #ifdef COMPILE_PCRE8
4613         if ((options & PCRE_CASELESS) != 0)
4614 #else
4615         /* In non 8 bit mode, we can get here even if we are not in UTF mode. */
4616         if (utf && (options & PCRE_CASELESS) != 0)
4617 #endif
4618           {
4619           unsigned int othercase;
4620           if ((int)(othercase = UCD_OTHERCASE(c)) != c)
4621             {
4622             *class_uchardata++ = XCL_SINGLE;
4623             class_uchardata += PRIV(ord2utf)(othercase, class_uchardata);
4624             }
4625           }
4626 #endif  /* SUPPORT_UCP */
4627 
4628         }
4629       else
4630 #endif  /* SUPPORT_UTF || !COMPILE_PCRE8 */
4631 
4632       /* Handle a single-byte character */
4633         {
4634         class_has_8bitchar = 1;
4635         classbits[c/8] |= (1 << (c&7));
4636         if ((options & PCRE_CASELESS) != 0)
4637           {
4638           c = cd->fcc[c]; /* flip case */
4639           classbits[c/8] |= (1 << (c&7));
4640           }
4641         }
4642       }
4643 
4644     /* Loop until ']' reached. This "while" is the end of the "do" far above.
4645     If we are at the end of an internal nested string, revert to the outer
4646     string. */
4647 
4648     while (((c = *(++ptr)) != 0 ||
4649            (nestptr != NULL &&
4650              (ptr = nestptr, nestptr = NULL, c = *(++ptr)) != 0)) &&
4651            (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
4652 
4653     /* Check for missing terminating ']' */
4654 
4655     if (c == 0)
4656       {
4657       *errorcodeptr = ERR6;
4658       goto FAILED;
4659       }
4660 
4661     /* If this is the first thing in the branch, there can be no first char
4662     setting, whatever the repeat count. Any reqchar setting must remain
4663     unchanged after any kind of repeat. */
4664 
4665     if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
4666     zerofirstchar = firstchar;
4667     zeroreqchar = reqchar;
4668 
4669     /* If there are characters with values > 255, we have to compile an
4670     extended class, with its own opcode, unless there was a negated special
4671     such as \S in the class, and PCRE_UCP is not set, because in that case all
4672     characters > 255 are in the class, so any that were explicitly given as
4673     well can be ignored. If (when there are explicit characters > 255 that must
4674     be listed) there are no characters < 256, we can omit the bitmap in the
4675     actual compiled code. */
4676 
4677 #ifdef SUPPORT_UTF
4678     if (xclass && (!should_flip_negation || (options & PCRE_UCP) != 0))
4679 #elif !defined COMPILE_PCRE8
4680     if (xclass && !should_flip_negation)
4681 #endif
4682 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
4683       {
4684       *class_uchardata++ = XCL_END;    /* Marks the end of extra data */
4685       *code++ = OP_XCLASS;
4686       code += LINK_SIZE;
4687       *code = negate_class? XCL_NOT:0;
4688 
4689       /* If the map is required, move up the extra data to make room for it;
4690       otherwise just move the code pointer to the end of the extra data. */
4691 
4692       if (class_has_8bitchar > 0)
4693         {
4694         *code++ |= XCL_MAP;
4695         memmove(code + (32 / sizeof(pcre_uchar)), code,
4696           IN_UCHARS(class_uchardata - code));
4697         memcpy(code, classbits, 32);
4698         code = class_uchardata + (32 / sizeof(pcre_uchar));
4699         }
4700       else code = class_uchardata;
4701 
4702       /* Now fill in the complete length of the item */
4703 
4704       PUT(previous, 1, (int)(code - previous));
4705       break;   /* End of class handling */
4706       }
4707 #endif
4708 
4709     /* If there are no characters > 255, or they are all to be included or
4710     excluded, set the opcode to OP_CLASS or OP_NCLASS, depending on whether the
4711     whole class was negated and whether there were negative specials such as \S
4712     (non-UCP) in the class. Then copy the 32-byte map into the code vector,
4713     negating it if necessary. */
4714 
4715     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
4716     if (lengthptr == NULL)    /* Save time in the pre-compile phase */
4717       {
4718       if (negate_class)
4719         for (c = 0; c < 32; c++) classbits[c] = ~classbits[c];
4720       memcpy(code, classbits, 32);
4721       }
4722     code += 32 / sizeof(pcre_uchar);
4723     NOT_CHAR:
4724     break;
4725 
4726 
4727     /* ===================================================================*/
4728     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
4729     has been tested above. */
4730 
4731     case CHAR_LEFT_CURLY_BRACKET:
4732     if (!is_quantifier) goto NORMAL_CHAR;
4733     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
4734     if (*errorcodeptr != 0) goto FAILED;
4735     goto REPEAT;
4736 
4737     case CHAR_ASTERISK:
4738     repeat_min = 0;
4739     repeat_max = -1;
4740     goto REPEAT;
4741 
4742     case CHAR_PLUS:
4743     repeat_min = 1;
4744     repeat_max = -1;
4745     goto REPEAT;
4746 
4747     case CHAR_QUESTION_MARK:
4748     repeat_min = 0;
4749     repeat_max = 1;
4750 
4751     REPEAT:
4752     if (previous == NULL)
4753       {
4754       *errorcodeptr = ERR9;
4755       goto FAILED;
4756       }
4757 
4758     if (repeat_min == 0)
4759       {
4760       firstchar = zerofirstchar;    /* Adjust for zero repeat */
4761       reqchar = zeroreqchar;        /* Ditto */
4762       }
4763 
4764     /* Remember whether this is a variable length repeat */
4765 
4766     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
4767 
4768     op_type = 0;                    /* Default single-char op codes */
4769     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
4770 
4771     /* Save start of previous item, in case we have to move it up in order to
4772     insert something before it. */
4773 
4774     tempcode = previous;
4775 
4776     /* If the next character is '+', we have a possessive quantifier. This
4777     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
4778     If the next character is '?' this is a minimizing repeat, by default,
4779     but if PCRE_UNGREEDY is set, it works the other way round. We change the
4780     repeat type to the non-default. */
4781 
4782     if (ptr[1] == CHAR_PLUS)
4783       {
4784       repeat_type = 0;                  /* Force greedy */
4785       possessive_quantifier = TRUE;
4786       ptr++;
4787       }
4788     else if (ptr[1] == CHAR_QUESTION_MARK)
4789       {
4790       repeat_type = greedy_non_default;
4791       ptr++;
4792       }
4793     else repeat_type = greedy_default;
4794 
4795     /* If previous was a recursion call, wrap it in atomic brackets so that
4796     previous becomes the atomic group. All recursions were so wrapped in the
4797     past, but it no longer happens for non-repeated recursions. In fact, the
4798     repeated ones could be re-implemented independently so as not to need this,
4799     but for the moment we rely on the code for repeating groups. */
4800 
4801     if (*previous == OP_RECURSE)
4802       {
4803       memmove(previous + 1 + LINK_SIZE, previous, IN_UCHARS(1 + LINK_SIZE));
4804       *previous = OP_ONCE;
4805       PUT(previous, 1, 2 + 2*LINK_SIZE);
4806       previous[2 + 2*LINK_SIZE] = OP_KET;
4807       PUT(previous, 3 + 2*LINK_SIZE, 2 + 2*LINK_SIZE);
4808       code += 2 + 2 * LINK_SIZE;
4809       length_prevgroup = 3 + 3*LINK_SIZE;
4810 
4811       /* When actually compiling, we need to check whether this was a forward
4812       reference, and if so, adjust the offset. */
4813 
4814       if (lengthptr == NULL && cd->hwm >= cd->start_workspace + LINK_SIZE)
4815         {
4816         int offset = GET(cd->hwm, -LINK_SIZE);
4817         if (offset == previous + 1 - cd->start_code)
4818           PUT(cd->hwm, -LINK_SIZE, offset + 1 + LINK_SIZE);
4819         }
4820       }
4821 
4822     /* Now handle repetition for the different types of item. */
4823 
4824     /* If previous was a character or negated character match, abolish the item
4825     and generate a repeat item instead. If a char item has a minimum of more
4826     than one, ensure that it is set in reqchar - it might not be if a sequence
4827     such as x{3} is the first thing in a branch because the x will have gone
4828     into firstchar instead.  */
4829 
4830     if (*previous == OP_CHAR || *previous == OP_CHARI
4831         || *previous == OP_NOT || *previous == OP_NOTI)
4832       {
4833       switch (*previous)
4834         {
4835         default: /* Make compiler happy. */
4836         case OP_CHAR:  op_type = OP_STAR - OP_STAR; break;
4837         case OP_CHARI: op_type = OP_STARI - OP_STAR; break;
4838         case OP_NOT:   op_type = OP_NOTSTAR - OP_STAR; break;
4839         case OP_NOTI:  op_type = OP_NOTSTARI - OP_STAR; break;
4840         }
4841 
4842       /* Deal with UTF characters that take up more than one character. It's
4843       easier to write this out separately than try to macrify it. Use c to
4844       hold the length of the character in bytes, plus UTF_LENGTH to flag that
4845       it's a length rather than a small character. */
4846 
4847 #ifdef SUPPORT_UTF
4848       if (utf && NOT_FIRSTCHAR(code[-1]))
4849         {
4850         pcre_uchar *lastchar = code - 1;
4851         BACKCHAR(lastchar);
4852         c = (int)(code - lastchar);     /* Length of UTF-8 character */
4853         memcpy(utf_chars, lastchar, IN_UCHARS(c)); /* Save the char */
4854         c |= UTF_LENGTH;                /* Flag c as a length */
4855         }
4856       else
4857 #endif /* SUPPORT_UTF */
4858 
4859       /* Handle the case of a single character - either with no UTF support, or
4860       with UTF disabled, or for a single character UTF character. */
4861         {
4862         c = code[-1];
4863         if (*previous <= OP_CHARI && repeat_min > 1)
4864           reqchar = c | req_caseopt | cd->req_varyopt;
4865         }
4866 
4867       /* If the repetition is unlimited, it pays to see if the next thing on
4868       the line is something that cannot possibly match this character. If so,
4869       automatically possessifying this item gains some performance in the case
4870       where the match fails. */
4871 
4872       if (!possessive_quantifier &&
4873           repeat_max < 0 &&
4874           check_auto_possessive(previous, utf, ptr + 1, options, cd))
4875         {
4876         repeat_type = 0;    /* Force greedy */
4877         possessive_quantifier = TRUE;
4878         }
4879 
4880       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
4881       }
4882 
4883     /* If previous was a character type match (\d or similar), abolish it and
4884     create a suitable repeat item. The code is shared with single-character
4885     repeats by setting op_type to add a suitable offset into repeat_type. Note
4886     that the Unicode property types will be present only when SUPPORT_UCP is
4887     defined, but we don't wrap the little bits of code here because it just
4888     makes it horribly messy. */
4889 
4890     else if (*previous < OP_EODN)
4891       {
4892       pcre_uchar *oldcode;
4893       int prop_type, prop_value;
4894       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
4895       c = *previous;
4896 
4897       if (!possessive_quantifier &&
4898           repeat_max < 0 &&
4899           check_auto_possessive(previous, utf, ptr + 1, options, cd))
4900         {
4901         repeat_type = 0;    /* Force greedy */
4902         possessive_quantifier = TRUE;
4903         }
4904 
4905       OUTPUT_SINGLE_REPEAT:
4906       if (*previous == OP_PROP || *previous == OP_NOTPROP)
4907         {
4908         prop_type = previous[1];
4909         prop_value = previous[2];
4910         }
4911       else prop_type = prop_value = -1;
4912 
4913       oldcode = code;
4914       code = previous;                  /* Usually overwrite previous item */
4915 
4916       /* If the maximum is zero then the minimum must also be zero; Perl allows
4917       this case, so we do too - by simply omitting the item altogether. */
4918 
4919       if (repeat_max == 0) goto END_REPEAT;
4920 
4921       /*--------------------------------------------------------------------*/
4922       /* This code is obsolete from release 8.00; the restriction was finally
4923       removed: */
4924 
4925       /* All real repeats make it impossible to handle partial matching (maybe
4926       one day we will be able to remove this restriction). */
4927 
4928       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
4929       /*--------------------------------------------------------------------*/
4930 
4931       /* Combine the op_type with the repeat_type */
4932 
4933       repeat_type += op_type;
4934 
4935       /* A minimum of zero is handled either as the special case * or ?, or as
4936       an UPTO, with the maximum given. */
4937 
4938       if (repeat_min == 0)
4939         {
4940         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
4941           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
4942         else
4943           {
4944           *code++ = OP_UPTO + repeat_type;
4945           PUT2INC(code, 0, repeat_max);
4946           }
4947         }
4948 
4949       /* A repeat minimum of 1 is optimized into some special cases. If the
4950       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
4951       left in place and, if the maximum is greater than 1, we use OP_UPTO with
4952       one less than the maximum. */
4953 
4954       else if (repeat_min == 1)
4955         {
4956         if (repeat_max == -1)
4957           *code++ = OP_PLUS + repeat_type;
4958         else
4959           {
4960           code = oldcode;                 /* leave previous item in place */
4961           if (repeat_max == 1) goto END_REPEAT;
4962           *code++ = OP_UPTO + repeat_type;
4963           PUT2INC(code, 0, repeat_max - 1);
4964           }
4965         }
4966 
4967       /* The case {n,n} is just an EXACT, while the general case {n,m} is
4968       handled as an EXACT followed by an UPTO. */
4969 
4970       else
4971         {
4972         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
4973         PUT2INC(code, 0, repeat_min);
4974 
4975         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
4976         we have to insert the character for the previous code. For a repeated
4977         Unicode property match, there are two extra bytes that define the
4978         required property. In UTF-8 mode, long characters have their length in
4979         c, with the UTF_LENGTH bit as a flag. */
4980 
4981         if (repeat_max < 0)
4982           {
4983 #ifdef SUPPORT_UTF
4984           if (utf && (c & UTF_LENGTH) != 0)
4985             {
4986             memcpy(code, utf_chars, IN_UCHARS(c & 7));
4987             code += c & 7;
4988             }
4989           else
4990 #endif
4991             {
4992             *code++ = c;
4993             if (prop_type >= 0)
4994               {
4995               *code++ = prop_type;
4996               *code++ = prop_value;
4997               }
4998             }
4999           *code++ = OP_STAR + repeat_type;
5000           }
5001 
5002         /* Else insert an UPTO if the max is greater than the min, again
5003         preceded by the character, for the previously inserted code. If the
5004         UPTO is just for 1 instance, we can use QUERY instead. */
5005 
5006         else if (repeat_max != repeat_min)
5007           {
5008 #ifdef SUPPORT_UTF
5009           if (utf && (c & UTF_LENGTH) != 0)
5010             {
5011             memcpy(code, utf_chars, IN_UCHARS(c & 7));
5012             code += c & 7;
5013             }
5014           else
5015 #endif
5016           *code++ = c;
5017           if (prop_type >= 0)
5018             {
5019             *code++ = prop_type;
5020             *code++ = prop_value;
5021             }
5022           repeat_max -= repeat_min;
5023 
5024           if (repeat_max == 1)
5025             {
5026             *code++ = OP_QUERY + repeat_type;
5027             }
5028           else
5029             {
5030             *code++ = OP_UPTO + repeat_type;
5031             PUT2INC(code, 0, repeat_max);
5032             }
5033           }
5034         }
5035 
5036       /* The character or character type itself comes last in all cases. */
5037 
5038 #ifdef SUPPORT_UTF
5039       if (utf && (c & UTF_LENGTH) != 0)
5040         {
5041         memcpy(code, utf_chars, IN_UCHARS(c & 7));
5042         code += c & 7;
5043         }
5044       else
5045 #endif
5046       *code++ = c;
5047 
5048       /* For a repeated Unicode property match, there are two extra bytes that
5049       define the required property. */
5050 
5051 #ifdef SUPPORT_UCP
5052       if (prop_type >= 0)
5053         {
5054         *code++ = prop_type;
5055         *code++ = prop_value;
5056         }
5057 #endif
5058       }
5059 
5060     /* If previous was a character class or a back reference, we put the repeat
5061     stuff after it, but just skip the item if the repeat was {0,0}. */
5062 
5063     else if (*previous == OP_CLASS ||
5064              *previous == OP_NCLASS ||
5065 #if defined SUPPORT_UTF || !defined COMPILE_PCRE8
5066              *previous == OP_XCLASS ||
5067 #endif
5068              *previous == OP_REF ||
5069              *previous == OP_REFI)
5070       {
5071       if (repeat_max == 0)
5072         {
5073         code = previous;
5074         goto END_REPEAT;
5075         }
5076 
5077       /*--------------------------------------------------------------------*/
5078       /* This code is obsolete from release 8.00; the restriction was finally
5079       removed: */
5080 
5081       /* All real repeats make it impossible to handle partial matching (maybe
5082       one day we will be able to remove this restriction). */
5083 
5084       /* if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL; */
5085       /*--------------------------------------------------------------------*/
5086 
5087       if (repeat_min == 0 && repeat_max == -1)
5088         *code++ = OP_CRSTAR + repeat_type;
5089       else if (repeat_min == 1 && repeat_max == -1)
5090         *code++ = OP_CRPLUS + repeat_type;
5091       else if (repeat_min == 0 && repeat_max == 1)
5092         *code++ = OP_CRQUERY + repeat_type;
5093       else
5094         {
5095         *code++ = OP_CRRANGE + repeat_type;
5096         PUT2INC(code, 0, repeat_min);
5097         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
5098         PUT2INC(code, 0, repeat_max);
5099         }
5100       }
5101 
5102     /* If previous was a bracket group, we may have to replicate it in certain
5103     cases. Note that at this point we can encounter only the "basic" bracket
5104     opcodes such as BRA and CBRA, as this is the place where they get converted
5105     into the more special varieties such as BRAPOS and SBRA. A test for >=
5106     OP_ASSERT and <= OP_COND includes ASSERT, ASSERT_NOT, ASSERTBACK,
5107     ASSERTBACK_NOT, ONCE, BRA, CBRA, and COND. Originally, PCRE did not allow
5108     repetition of assertions, but now it does, for Perl compatibility. */
5109 
5110     else if (*previous >= OP_ASSERT && *previous <= OP_COND)
5111       {
5112       int i;
5113       int len = (int)(code - previous);
5114       pcre_uchar *bralink = NULL;
5115       pcre_uchar *brazeroptr = NULL;
5116 
5117       /* Repeating a DEFINE group is pointless, but Perl allows the syntax, so
5118       we just ignore the repeat. */
5119 
5120       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
5121         goto END_REPEAT;
5122 
5123       /* There is no sense in actually repeating assertions. The only potential
5124       use of repetition is in cases when the assertion is optional. Therefore,
5125       if the minimum is greater than zero, just ignore the repeat. If the
5126       maximum is not zero or one, set it to 1. */
5127 
5128       if (*previous < OP_ONCE)    /* Assertion */
5129         {
5130         if (repeat_min > 0) goto END_REPEAT;
5131         if (repeat_max < 0 || repeat_max > 1) repeat_max = 1;
5132         }
5133 
5134       /* The case of a zero minimum is special because of the need to stick
5135       OP_BRAZERO in front of it, and because the group appears once in the
5136       data, whereas in other cases it appears the minimum number of times. For
5137       this reason, it is simplest to treat this case separately, as otherwise
5138       the code gets far too messy. There are several special subcases when the
5139       minimum is zero. */
5140 
5141       if (repeat_min == 0)
5142         {
5143         /* If the maximum is also zero, we used to just omit the group from the
5144         output altogether, like this:
5145 
5146         ** if (repeat_max == 0)
5147         **   {
5148         **   code = previous;
5149         **   goto END_REPEAT;
5150         **   }
5151 
5152         However, that fails when a group or a subgroup within it is referenced
5153         as a subroutine from elsewhere in the pattern, so now we stick in
5154         OP_SKIPZERO in front of it so that it is skipped on execution. As we
5155         don't have a list of which groups are referenced, we cannot do this
5156         selectively.
5157 
5158         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
5159         and do no more at this point. However, we do need to adjust any
5160         OP_RECURSE calls inside the group that refer to the group itself or any
5161         internal or forward referenced group, because the offset is from the
5162         start of the whole regex. Temporarily terminate the pattern while doing
5163         this. */
5164 
5165         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
5166           {
5167           *code = OP_END;
5168           adjust_recurse(previous, 1, utf, cd, save_hwm);
5169           memmove(previous + 1, previous, IN_UCHARS(len));
5170           code++;
5171           if (repeat_max == 0)
5172             {
5173             *previous++ = OP_SKIPZERO;
5174             goto END_REPEAT;
5175             }
5176           brazeroptr = previous;    /* Save for possessive optimizing */
5177           *previous++ = OP_BRAZERO + repeat_type;
5178           }
5179 
5180         /* If the maximum is greater than 1 and limited, we have to replicate
5181         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
5182         The first one has to be handled carefully because it's the original
5183         copy, which has to be moved up. The remainder can be handled by code
5184         that is common with the non-zero minimum case below. We have to
5185         adjust the value of repeat_max, since one less copy is required. Once
5186         again, we may have to adjust any OP_RECURSE calls inside the group. */
5187 
5188         else
5189           {
5190           int offset;
5191           *code = OP_END;
5192           adjust_recurse(previous, 2 + LINK_SIZE, utf, cd, save_hwm);
5193           memmove(previous + 2 + LINK_SIZE, previous, IN_UCHARS(len));
5194           code += 2 + LINK_SIZE;
5195           *previous++ = OP_BRAZERO + repeat_type;
5196           *previous++ = OP_BRA;
5197 
5198           /* We chain together the bracket offset fields that have to be
5199           filled in later when the ends of the brackets are reached. */
5200 
5201           offset = (bralink == NULL)? 0 : (int)(previous - bralink);
5202           bralink = previous;
5203           PUTINC(previous, 0, offset);
5204           }
5205 
5206         repeat_max--;
5207         }
5208 
5209       /* If the minimum is greater than zero, replicate the group as many
5210       times as necessary, and adjust the maximum to the number of subsequent
5211       copies that we need. If we set a first char from the group, and didn't
5212       set a required char, copy the latter from the former. If there are any
5213       forward reference subroutine calls in the group, there will be entries on
5214       the workspace list; replicate these with an appropriate increment. */
5215 
5216       else
5217         {
5218         if (repeat_min > 1)
5219           {
5220           /* In the pre-compile phase, we don't actually do the replication. We
5221           just adjust the length as if we had. Do some paranoid checks for
5222           potential integer overflow. The INT64_OR_DOUBLE type is a 64-bit
5223           integer type when available, otherwise double. */
5224 
5225           if (lengthptr != NULL)
5226             {
5227             int delta = (repeat_min - 1)*length_prevgroup;
5228             if ((INT64_OR_DOUBLE)(repeat_min - 1)*
5229                   (INT64_OR_DOUBLE)length_prevgroup >
5230                     (INT64_OR_DOUBLE)INT_MAX ||
5231                 OFLOW_MAX - *lengthptr < delta)
5232               {
5233               *errorcodeptr = ERR20;
5234               goto FAILED;
5235               }
5236             *lengthptr += delta;
5237             }
5238 
5239           /* This is compiling for real. If there is a set first byte for
5240           the group, and we have not yet set a "required byte", set it. Make
5241           sure there is enough workspace for copying forward references before
5242           doing the copy. */
5243 
5244           else
5245             {
5246             if (groupsetfirstchar && reqchar < 0) reqchar = firstchar;
5247 
5248             for (i = 1; i < repeat_min; i++)
5249               {
5250               pcre_uchar *hc;
5251               pcre_uchar *this_hwm = cd->hwm;
5252               memcpy(code, previous, IN_UCHARS(len));
5253 
5254               while (cd->hwm > cd->start_workspace + cd->workspace_size -
5255                      WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5256                 {
5257                 int save_offset = save_hwm - cd->start_workspace;
5258                 int this_offset = this_hwm - cd->start_workspace;
5259                 *errorcodeptr = expand_workspace(cd);
5260                 if (*errorcodeptr != 0) goto FAILED;
5261                 save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5262                 this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5263                 }
5264 
5265               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5266                 {
5267                 PUT(cd->hwm, 0, GET(hc, 0) + len);
5268                 cd->hwm += LINK_SIZE;
5269                 }
5270               save_hwm = this_hwm;
5271               code += len;
5272               }
5273             }
5274           }
5275 
5276         if (repeat_max > 0) repeat_max -= repeat_min;
5277         }
5278 
5279       /* This code is common to both the zero and non-zero minimum cases. If
5280       the maximum is limited, it replicates the group in a nested fashion,
5281       remembering the bracket starts on a stack. In the case of a zero minimum,
5282       the first one was set up above. In all cases the repeat_max now specifies
5283       the number of additional copies needed. Again, we must remember to
5284       replicate entries on the forward reference list. */
5285 
5286       if (repeat_max >= 0)
5287         {
5288         /* In the pre-compile phase, we don't actually do the replication. We
5289         just adjust the length as if we had. For each repetition we must add 1
5290         to the length for BRAZERO and for all but the last repetition we must
5291         add 2 + 2*LINKSIZE to allow for the nesting that occurs. Do some
5292         paranoid checks to avoid integer overflow. The INT64_OR_DOUBLE type is
5293         a 64-bit integer type when available, otherwise double. */
5294 
5295         if (lengthptr != NULL && repeat_max > 0)
5296           {
5297           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
5298                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
5299           if ((INT64_OR_DOUBLE)repeat_max *
5300                 (INT64_OR_DOUBLE)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
5301                   > (INT64_OR_DOUBLE)INT_MAX ||
5302               OFLOW_MAX - *lengthptr < delta)
5303             {
5304             *errorcodeptr = ERR20;
5305             goto FAILED;
5306             }
5307           *lengthptr += delta;
5308           }
5309 
5310         /* This is compiling for real */
5311 
5312         else for (i = repeat_max - 1; i >= 0; i--)
5313           {
5314           pcre_uchar *hc;
5315           pcre_uchar *this_hwm = cd->hwm;
5316 
5317           *code++ = OP_BRAZERO + repeat_type;
5318 
5319           /* All but the final copy start a new nesting, maintaining the
5320           chain of brackets outstanding. */
5321 
5322           if (i != 0)
5323             {
5324             int offset;
5325             *code++ = OP_BRA;
5326             offset = (bralink == NULL)? 0 : (int)(code - bralink);
5327             bralink = code;
5328             PUTINC(code, 0, offset);
5329             }
5330 
5331           memcpy(code, previous, IN_UCHARS(len));
5332 
5333           /* Ensure there is enough workspace for forward references before
5334           copying them. */
5335 
5336           while (cd->hwm > cd->start_workspace + cd->workspace_size -
5337                  WORK_SIZE_SAFETY_MARGIN - (this_hwm - save_hwm))
5338             {
5339             int save_offset = save_hwm - cd->start_workspace;
5340             int this_offset = this_hwm - cd->start_workspace;
5341             *errorcodeptr = expand_workspace(cd);
5342             if (*errorcodeptr != 0) goto FAILED;
5343             save_hwm = (pcre_uchar *)cd->start_workspace + save_offset;
5344             this_hwm = (pcre_uchar *)cd->start_workspace + this_offset;
5345             }
5346 
5347           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
5348             {
5349             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
5350             cd->hwm += LINK_SIZE;
5351             }
5352           save_hwm = this_hwm;
5353           code += len;
5354           }
5355 
5356         /* Now chain through the pending brackets, and fill in their length
5357         fields (which are holding the chain links pro tem). */
5358 
5359         while (bralink != NULL)
5360           {
5361           int oldlinkoffset;
5362           int offset = (int)(code - bralink + 1);
5363           pcre_uchar *bra = code - offset;
5364           oldlinkoffset = GET(bra, 1);
5365           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
5366           *code++ = OP_KET;
5367           PUTINC(code, 0, offset);
5368           PUT(bra, 1, offset);
5369           }
5370         }
5371 
5372       /* If the maximum is unlimited, set a repeater in the final copy. For
5373       ONCE brackets, that's all we need to do. However, possessively repeated
5374       ONCE brackets can be converted into non-capturing brackets, as the
5375       behaviour of (?:xx)++ is the same as (?>xx)++ and this saves having to
5376       deal with possessive ONCEs specially.
5377 
5378       Otherwise, when we are doing the actual compile phase, check to see
5379       whether this group is one that could match an empty string. If so,
5380       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
5381       that runtime checking can be done. [This check is also applied to ONCE
5382       groups at runtime, but in a different way.]
5383 
5384       Then, if the quantifier was possessive and the bracket is not a
5385       conditional, we convert the BRA code to the POS form, and the KET code to
5386       KETRPOS. (It turns out to be convenient at runtime to detect this kind of
5387       subpattern at both the start and at the end.) The use of special opcodes
5388       makes it possible to reduce greatly the stack usage in pcre_exec(). If
5389       the group is preceded by OP_BRAZERO, convert this to OP_BRAPOSZERO.
5390 
5391       Then, if the minimum number of matches is 1 or 0, cancel the possessive
5392       flag so that the default action below, of wrapping everything inside
5393       atomic brackets, does not happen. When the minimum is greater than 1,
5394       there will be earlier copies of the group, and so we still have to wrap
5395       the whole thing. */
5396 
5397       else
5398         {
5399         pcre_uchar *ketcode = code - 1 - LINK_SIZE;
5400         pcre_uchar *bracode = ketcode - GET(ketcode, 1);
5401 
5402         /* Convert possessive ONCE brackets to non-capturing */
5403 
5404         if ((*bracode == OP_ONCE || *bracode == OP_ONCE_NC) &&
5405             possessive_quantifier) *bracode = OP_BRA;
5406 
5407         /* For non-possessive ONCE brackets, all we need to do is to
5408         set the KET. */
5409 
5410         if (*bracode == OP_ONCE || *bracode == OP_ONCE_NC)
5411           *ketcode = OP_KETRMAX + repeat_type;
5412 
5413         /* Handle non-ONCE brackets and possessive ONCEs (which have been
5414         converted to non-capturing above). */
5415 
5416         else
5417           {
5418           /* In the compile phase, check for empty string matching. */
5419 
5420           if (lengthptr == NULL)
5421             {
5422             pcre_uchar *scode = bracode;
5423             do
5424               {
5425               if (could_be_empty_branch(scode, ketcode, utf, cd))
5426                 {
5427                 *bracode += OP_SBRA - OP_BRA;
5428                 break;
5429                 }
5430               scode += GET(scode, 1);
5431               }
5432             while (*scode == OP_ALT);
5433             }
5434 
5435           /* Handle possessive quantifiers. */
5436 
5437           if (possessive_quantifier)
5438             {
5439             /* For COND brackets, we wrap the whole thing in a possessively
5440             repeated non-capturing bracket, because we have not invented POS
5441             versions of the COND opcodes. Because we are moving code along, we
5442             must ensure that any pending recursive references are updated. */
5443 
5444             if (*bracode == OP_COND || *bracode == OP_SCOND)
5445               {
5446               int nlen = (int)(code - bracode);
5447               *code = OP_END;
5448               adjust_recurse(bracode, 1 + LINK_SIZE, utf, cd, save_hwm);
5449               memmove(bracode + 1 + LINK_SIZE, bracode, IN_UCHARS(nlen));
5450               code += 1 + LINK_SIZE;
5451               nlen += 1 + LINK_SIZE;
5452               *bracode = OP_BRAPOS;
5453               *code++ = OP_KETRPOS;
5454               PUTINC(code, 0, nlen);
5455               PUT(bracode, 1, nlen);
5456               }
5457 
5458             /* For non-COND brackets, we modify the BRA code and use KETRPOS. */
5459 
5460             else
5461               {
5462               *bracode += 1;              /* Switch to xxxPOS opcodes */
5463               *ketcode = OP_KETRPOS;
5464               }
5465 
5466             /* If the minimum is zero, mark it as possessive, then unset the
5467             possessive flag when the minimum is 0 or 1. */
5468 
5469             if (brazeroptr != NULL) *brazeroptr = OP_BRAPOSZERO;
5470             if (repeat_min < 2) possessive_quantifier = FALSE;
5471             }
5472 
5473           /* Non-possessive quantifier */
5474 
5475           else *ketcode = OP_KETRMAX + repeat_type;
5476           }
5477         }
5478       }
5479 
5480     /* If previous is OP_FAIL, it was generated by an empty class [] in
5481     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
5482     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
5483     error above. We can just ignore the repeat in JS case. */
5484 
5485     else if (*previous == OP_FAIL) goto END_REPEAT;
5486 
5487     /* Else there's some kind of shambles */
5488 
5489     else
5490       {
5491       *errorcodeptr = ERR11;
5492       goto FAILED;
5493       }
5494 
5495     /* If the character following a repeat is '+', or if certain optimization
5496     tests above succeeded, possessive_quantifier is TRUE. For some opcodes,
5497     there are special alternative opcodes for this case. For anything else, we
5498     wrap the entire repeated item inside OP_ONCE brackets. Logically, the '+'
5499     notation is just syntactic sugar, taken from Sun's Java package, but the
5500     special opcodes can optimize it.
5501 
5502     Some (but not all) possessively repeated subpatterns have already been
5503     completely handled in the code just above. For them, possessive_quantifier
5504     is always FALSE at this stage.
5505 
5506     Note that the repeated item starts at tempcode, not at previous, which
5507     might be the first part of a string whose (former) last char we repeated.
5508 
5509     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
5510     an 'upto' may follow. We skip over an 'exact' item, and then test the
5511     length of what remains before proceeding. */
5512 
5513     if (possessive_quantifier)
5514       {
5515       int len;
5516 
5517       if (*tempcode == OP_TYPEEXACT)
5518         tempcode += PRIV(OP_lengths)[*tempcode] +
5519           ((tempcode[1 + IMM2_SIZE] == OP_PROP
5520           || tempcode[1 + IMM2_SIZE] == OP_NOTPROP)? 2 : 0);
5521 
5522       else if (*tempcode == OP_EXACT || *tempcode == OP_NOTEXACT)
5523         {
5524         tempcode += PRIV(OP_lengths)[*tempcode];
5525 #ifdef SUPPORT_UTF
5526         if (utf && HAS_EXTRALEN(tempcode[-1]))
5527           tempcode += GET_EXTRALEN(tempcode[-1]);
5528 #endif
5529         }
5530 
5531       len = (int)(code - tempcode);
5532       if (len > 0) switch (*tempcode)
5533         {
5534         case OP_STAR:  *tempcode = OP_POSSTAR; break;
5535         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
5536         case OP_QUERY: *tempcode = OP_POSQUERY; break;
5537         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
5538 
5539         case OP_STARI:  *tempcode = OP_POSSTARI; break;
5540         case OP_PLUSI:  *tempcode = OP_POSPLUSI; break;
5541         case OP_QUERYI: *tempcode = OP_POSQUERYI; break;
5542         case OP_UPTOI:  *tempcode = OP_POSUPTOI; break;
5543 
5544         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
5545         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
5546         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
5547         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
5548 
5549         case OP_NOTSTARI:  *tempcode = OP_NOTPOSSTARI; break;
5550         case OP_NOTPLUSI:  *tempcode = OP_NOTPOSPLUSI; break;
5551         case OP_NOTQUERYI: *tempcode = OP_NOTPOSQUERYI; break;
5552         case OP_NOTUPTOI:  *tempcode = OP_NOTPOSUPTOI; break;
5553 
5554         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
5555         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
5556         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
5557         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
5558 
5559         /* Because we are moving code along, we must ensure that any
5560         pending recursive references are updated. */
5561 
5562         default:
5563         *code = OP_END;
5564         adjust_recurse(tempcode, 1 + LINK_SIZE, utf, cd, save_hwm);
5565         memmove(tempcode + 1 + LINK_SIZE, tempcode, IN_UCHARS(len));
5566         code += 1 + LINK_SIZE;
5567         len += 1 + LINK_SIZE;
5568         tempcode[0] = OP_ONCE;
5569         *code++ = OP_KET;
5570         PUTINC(code, 0, len);
5571         PUT(tempcode, 1, len);
5572         break;
5573         }
5574       }
5575 
5576     /* In all cases we no longer have a previous item. We also set the
5577     "follows varying string" flag for subsequently encountered reqchars if
5578     it isn't already set and we have just passed a varying length item. */
5579 
5580     END_REPEAT:
5581     previous = NULL;
5582     cd->req_varyopt |= reqvary;
5583     break;
5584 
5585 
5586     /* ===================================================================*/
5587     /* Start of nested parenthesized sub-expression, or comment or lookahead or
5588     lookbehind or option setting or condition or all the other extended
5589     parenthesis forms.  */
5590 
5591     case CHAR_LEFT_PARENTHESIS:
5592     newoptions = options;
5593     skipbytes = 0;
5594     bravalue = OP_CBRA;
5595     save_hwm = cd->hwm;
5596     reset_bracount = FALSE;
5597 
5598     /* First deal with various "verbs" that can be introduced by '*'. */
5599 
5600     ptr++;
5601     if (ptr[0] == CHAR_ASTERISK && (ptr[1] == ':'
5602          || (MAX_255(ptr[1]) && ((cd->ctypes[ptr[1]] & ctype_letter) != 0))))
5603       {
5604       int i, namelen;
5605       int arglen = 0;
5606       const char *vn = verbnames;
5607       const pcre_uchar *name = ptr + 1;
5608       const pcre_uchar *arg = NULL;
5609       previous = NULL;
5610       ptr++;
5611       while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
5612       namelen = (int)(ptr - name);
5613 
5614       /* It appears that Perl allows any characters whatsoever, other than
5615       a closing parenthesis, to appear in arguments, so we no longer insist on
5616       letters, digits, and underscores. */
5617 
5618       if (*ptr == CHAR_COLON)
5619         {
5620         arg = ++ptr;
5621         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5622         arglen = (int)(ptr - arg);
5623         if (arglen > (int)MAX_MARK)
5624           {
5625           *errorcodeptr = ERR75;
5626           goto FAILED;
5627           }
5628         }
5629 
5630       if (*ptr != CHAR_RIGHT_PARENTHESIS)
5631         {
5632         *errorcodeptr = ERR60;
5633         goto FAILED;
5634         }
5635 
5636       /* Scan the table of verb names */
5637 
5638       for (i = 0; i < verbcount; i++)
5639         {
5640         if (namelen == verbs[i].len &&
5641             STRNCMP_UC_C8(name, vn, namelen) == 0)
5642           {
5643           /* Check for open captures before ACCEPT and convert it to
5644           ASSERT_ACCEPT if in an assertion. */
5645 
5646           if (verbs[i].op == OP_ACCEPT)
5647             {
5648             open_capitem *oc;
5649             if (arglen != 0)
5650               {
5651               *errorcodeptr = ERR59;
5652               goto FAILED;
5653               }
5654             cd->had_accept = TRUE;
5655             for (oc = cd->open_caps; oc != NULL; oc = oc->next)
5656               {
5657               *code++ = OP_CLOSE;
5658               PUT2INC(code, 0, oc->number);
5659               }
5660             *code++ = (cd->assert_depth > 0)? OP_ASSERT_ACCEPT : OP_ACCEPT;
5661 
5662             /* Do not set firstchar after *ACCEPT */
5663             if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
5664             }
5665 
5666           /* Handle other cases with/without an argument */
5667 
5668           else if (arglen == 0)
5669             {
5670             if (verbs[i].op < 0)   /* Argument is mandatory */
5671               {
5672               *errorcodeptr = ERR66;
5673               goto FAILED;
5674               }
5675             *code = verbs[i].op;
5676             if (*code++ == OP_THEN) cd->external_flags |= PCRE_HASTHEN;
5677             }
5678 
5679           else
5680             {
5681             if (verbs[i].op_arg < 0)   /* Argument is forbidden */
5682               {
5683               *errorcodeptr = ERR59;
5684               goto FAILED;
5685               }
5686             *code = verbs[i].op_arg;
5687             if (*code++ == OP_THEN_ARG) cd->external_flags |= PCRE_HASTHEN;
5688             *code++ = arglen;
5689             memcpy(code, arg, IN_UCHARS(arglen));
5690             code += arglen;
5691             *code++ = 0;
5692             }
5693 
5694           break;  /* Found verb, exit loop */
5695           }
5696 
5697         vn += verbs[i].len + 1;
5698         }
5699 
5700       if (i < verbcount) continue;    /* Successfully handled a verb */
5701       *errorcodeptr = ERR60;          /* Verb not recognized */
5702       goto FAILED;
5703       }
5704 
5705     /* Deal with the extended parentheses; all are introduced by '?', and the
5706     appearance of any of them means that this is not a capturing group. */
5707 
5708     else if (*ptr == CHAR_QUESTION_MARK)
5709       {
5710       int i, set, unset, namelen;
5711       int *optset;
5712       const pcre_uchar *name;
5713       pcre_uchar *slot;
5714 
5715       switch (*(++ptr))
5716         {
5717         case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
5718         ptr++;
5719         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
5720         if (*ptr == 0)
5721           {
5722           *errorcodeptr = ERR18;
5723           goto FAILED;
5724           }
5725         continue;
5726 
5727 
5728         /* ------------------------------------------------------------ */
5729         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
5730         reset_bracount = TRUE;
5731         /* Fall through */
5732 
5733         /* ------------------------------------------------------------ */
5734         case CHAR_COLON:          /* Non-capturing bracket */
5735         bravalue = OP_BRA;
5736         ptr++;
5737         break;
5738 
5739 
5740         /* ------------------------------------------------------------ */
5741         case CHAR_LEFT_PARENTHESIS:
5742         bravalue = OP_COND;       /* Conditional group */
5743 
5744         /* A condition can be an assertion, a number (referring to a numbered
5745         group), a name (referring to a named group), or 'R', referring to
5746         recursion. R<digits> and R&name are also permitted for recursion tests.
5747 
5748         There are several syntaxes for testing a named group: (?(name)) is used
5749         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
5750 
5751         There are two unfortunate ambiguities, caused by history. (a) 'R' can
5752         be the recursive thing or the name 'R' (and similarly for 'R' followed
5753         by digits), and (b) a number could be a name that consists of digits.
5754         In both cases, we look for a name first; if not found, we try the other
5755         cases. */
5756 
5757         /* For conditions that are assertions, check the syntax, and then exit
5758         the switch. This will take control down to where bracketed groups,
5759         including assertions, are processed. */
5760 
5761         if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
5762             ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
5763           break;
5764 
5765         /* Most other conditions use OP_CREF (a couple change to OP_RREF
5766         below), and all need to skip 1+IMM2_SIZE bytes at the start of the group. */
5767 
5768         code[1+LINK_SIZE] = OP_CREF;
5769         skipbytes = 1+IMM2_SIZE;
5770         refsign = -1;
5771 
5772         /* Check for a test for recursion in a named group. */
5773 
5774         if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
5775           {
5776           terminator = -1;
5777           ptr += 2;
5778           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
5779           }
5780 
5781         /* Check for a test for a named group's having been set, using the Perl
5782         syntax (?(<name>) or (?('name') */
5783 
5784         else if (ptr[1] == CHAR_LESS_THAN_SIGN)
5785           {
5786           terminator = CHAR_GREATER_THAN_SIGN;
5787           ptr++;
5788           }
5789         else if (ptr[1] == CHAR_APOSTROPHE)
5790           {
5791           terminator = CHAR_APOSTROPHE;
5792           ptr++;
5793           }
5794         else
5795           {
5796           terminator = 0;
5797           if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
5798           }
5799 
5800         /* We now expect to read a name; anything else is an error */
5801 
5802         if (!MAX_255(ptr[1]) || (cd->ctypes[ptr[1]] & ctype_word) == 0)
5803           {
5804           ptr += 1;  /* To get the right offset */
5805           *errorcodeptr = ERR28;
5806           goto FAILED;
5807           }
5808 
5809         /* Read the name, but also get it as a number if it's all digits */
5810 
5811         recno = 0;
5812         name = ++ptr;
5813         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0)
5814           {
5815           if (recno >= 0)
5816             recno = (IS_DIGIT(*ptr))? recno * 10 + *ptr - CHAR_0 : -1;
5817           ptr++;
5818           }
5819         namelen = (int)(ptr - name);
5820 
5821         if ((terminator > 0 && *ptr++ != terminator) ||
5822             *ptr++ != CHAR_RIGHT_PARENTHESIS)
5823           {
5824           ptr--;      /* Error offset */
5825           *errorcodeptr = ERR26;
5826           goto FAILED;
5827           }
5828 
5829         /* Do no further checking in the pre-compile phase. */
5830 
5831         if (lengthptr != NULL) break;
5832 
5833         /* In the real compile we do the work of looking for the actual
5834         reference. If the string started with "+" or "-" we require the rest to
5835         be digits, in which case recno will be set. */
5836 
5837         if (refsign > 0)
5838           {
5839           if (recno <= 0)
5840             {
5841             *errorcodeptr = ERR58;
5842             goto FAILED;
5843             }
5844           recno = (refsign == CHAR_MINUS)?
5845             cd->bracount - recno + 1 : recno +cd->bracount;
5846           if (recno <= 0 || recno > cd->final_bracount)
5847             {
5848             *errorcodeptr = ERR15;
5849             goto FAILED;
5850             }
5851           PUT2(code, 2+LINK_SIZE, recno);
5852           break;
5853           }
5854 
5855         /* Otherwise (did not start with "+" or "-"), start by looking for the
5856         name. If we find a name, add one to the opcode to change OP_CREF or
5857         OP_RREF into OP_NCREF or OP_NRREF. These behave exactly the same,
5858         except they record that the reference was originally to a name. The
5859         information is used to check duplicate names. */
5860 
5861         slot = cd->name_table;
5862         for (i = 0; i < cd->names_found; i++)
5863           {
5864           if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0) break;
5865           slot += cd->name_entry_size;
5866           }
5867 
5868         /* Found a previous named subpattern */
5869 
5870         if (i < cd->names_found)
5871           {
5872           recno = GET2(slot, 0);
5873           PUT2(code, 2+LINK_SIZE, recno);
5874           code[1+LINK_SIZE]++;
5875           }
5876 
5877         /* Search the pattern for a forward reference */
5878 
5879         else if ((i = find_parens(cd, name, namelen,
5880                         (options & PCRE_EXTENDED) != 0, utf)) > 0)
5881           {
5882           PUT2(code, 2+LINK_SIZE, i);
5883           code[1+LINK_SIZE]++;
5884           }
5885 
5886         /* If terminator == 0 it means that the name followed directly after
5887         the opening parenthesis [e.g. (?(abc)...] and in this case there are
5888         some further alternatives to try. For the cases where terminator != 0
5889         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
5890         now checked all the possibilities, so give an error. */
5891 
5892         else if (terminator != 0)
5893           {
5894           *errorcodeptr = ERR15;
5895           goto FAILED;
5896           }
5897 
5898         /* Check for (?(R) for recursion. Allow digits after R to specify a
5899         specific group number. */
5900 
5901         else if (*name == CHAR_R)
5902           {
5903           recno = 0;
5904           for (i = 1; i < namelen; i++)
5905             {
5906             if (!IS_DIGIT(name[i]))
5907               {
5908               *errorcodeptr = ERR15;
5909               goto FAILED;
5910               }
5911             recno = recno * 10 + name[i] - CHAR_0;
5912             }
5913           if (recno == 0) recno = RREF_ANY;
5914           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
5915           PUT2(code, 2+LINK_SIZE, recno);
5916           }
5917 
5918         /* Similarly, check for the (?(DEFINE) "condition", which is always
5919         false. */
5920 
5921         else if (namelen == 6 && STRNCMP_UC_C8(name, STRING_DEFINE, 6) == 0)
5922           {
5923           code[1+LINK_SIZE] = OP_DEF;
5924           skipbytes = 1;
5925           }
5926 
5927         /* Check for the "name" actually being a subpattern number. We are
5928         in the second pass here, so final_bracount is set. */
5929 
5930         else if (recno > 0 && recno <= cd->final_bracount)
5931           {
5932           PUT2(code, 2+LINK_SIZE, recno);
5933           }
5934 
5935         /* Either an unidentified subpattern, or a reference to (?(0) */
5936 
5937         else
5938           {
5939           *errorcodeptr = (recno == 0)? ERR35: ERR15;
5940           goto FAILED;
5941           }
5942         break;
5943 
5944 
5945         /* ------------------------------------------------------------ */
5946         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
5947         bravalue = OP_ASSERT;
5948         cd->assert_depth += 1;
5949         ptr++;
5950         break;
5951 
5952 
5953         /* ------------------------------------------------------------ */
5954         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
5955         ptr++;
5956         if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
5957           {
5958           *code++ = OP_FAIL;
5959           previous = NULL;
5960           continue;
5961           }
5962         bravalue = OP_ASSERT_NOT;
5963         cd->assert_depth += 1;
5964         break;
5965 
5966 
5967         /* ------------------------------------------------------------ */
5968         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
5969         switch (ptr[1])
5970           {
5971           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
5972           bravalue = OP_ASSERTBACK;
5973           cd->assert_depth += 1;
5974           ptr += 2;
5975           break;
5976 
5977           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
5978           bravalue = OP_ASSERTBACK_NOT;
5979           cd->assert_depth += 1;
5980           ptr += 2;
5981           break;
5982 
5983           default:                /* Could be name define, else bad */
5984           if (MAX_255(ptr[1]) && (cd->ctypes[ptr[1]] & ctype_word) != 0)
5985             goto DEFINE_NAME;
5986           ptr++;                  /* Correct offset for error */
5987           *errorcodeptr = ERR24;
5988           goto FAILED;
5989           }
5990         break;
5991 
5992 
5993         /* ------------------------------------------------------------ */
5994         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
5995         bravalue = OP_ONCE;
5996         ptr++;
5997         break;
5998 
5999 
6000         /* ------------------------------------------------------------ */
6001         case CHAR_C:                 /* Callout - may be followed by digits; */
6002         previous_callout = code;     /* Save for later completion */
6003         after_manual_callout = 1;    /* Skip one item before completing */
6004         *code++ = OP_CALLOUT;
6005           {
6006           int n = 0;
6007           ptr++;
6008           while(IS_DIGIT(*ptr))
6009             n = n * 10 + *ptr++ - CHAR_0;
6010           if (*ptr != CHAR_RIGHT_PARENTHESIS)
6011             {
6012             *errorcodeptr = ERR39;
6013             goto FAILED;
6014             }
6015           if (n > 255)
6016             {
6017             *errorcodeptr = ERR38;
6018             goto FAILED;
6019             }
6020           *code++ = n;
6021           PUT(code, 0, (int)(ptr - cd->start_pattern + 1)); /* Pattern offset */
6022           PUT(code, LINK_SIZE, 0);                          /* Default length */
6023           code += 2 * LINK_SIZE;
6024           }
6025         previous = NULL;
6026         continue;
6027 
6028 
6029         /* ------------------------------------------------------------ */
6030         case CHAR_P:              /* Python-style named subpattern handling */
6031         if (*(++ptr) == CHAR_EQUALS_SIGN ||
6032             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
6033           {
6034           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
6035           terminator = CHAR_RIGHT_PARENTHESIS;
6036           goto NAMED_REF_OR_RECURSE;
6037           }
6038         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
6039           {
6040           *errorcodeptr = ERR41;
6041           goto FAILED;
6042           }
6043         /* Fall through to handle (?P< as (?< is handled */
6044 
6045 
6046         /* ------------------------------------------------------------ */
6047         DEFINE_NAME:    /* Come here from (?< handling */
6048         case CHAR_APOSTROPHE:
6049           {
6050           terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
6051             CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6052           name = ++ptr;
6053 
6054           while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6055           namelen = (int)(ptr - name);
6056 
6057           /* In the pre-compile phase, just do a syntax check. */
6058 
6059           if (lengthptr != NULL)
6060             {
6061             if (*ptr != terminator)
6062               {
6063               *errorcodeptr = ERR42;
6064               goto FAILED;
6065               }
6066             if (cd->names_found >= MAX_NAME_COUNT)
6067               {
6068               *errorcodeptr = ERR49;
6069               goto FAILED;
6070               }
6071             if (namelen + IMM2_SIZE + 1 > cd->name_entry_size)
6072               {
6073               cd->name_entry_size = namelen + IMM2_SIZE + 1;
6074               if (namelen > MAX_NAME_SIZE)
6075                 {
6076                 *errorcodeptr = ERR48;
6077                 goto FAILED;
6078                 }
6079               }
6080             }
6081 
6082           /* In the real compile, create the entry in the table, maintaining
6083           alphabetical order. Duplicate names for different numbers are
6084           permitted only if PCRE_DUPNAMES is set. Duplicate names for the same
6085           number are always OK. (An existing number can be re-used if (?|
6086           appears in the pattern.) In either event, a duplicate name results in
6087           a duplicate entry in the table, even if the number is the same. This
6088           is because the number of names, and hence the table size, is computed
6089           in the pre-compile, and it affects various numbers and pointers which
6090           would all have to be modified, and the compiled code moved down, if
6091           duplicates with the same number were omitted from the table. This
6092           doesn't seem worth the hassle. However, *different* names for the
6093           same number are not permitted. */
6094 
6095           else
6096             {
6097             BOOL dupname = FALSE;
6098             slot = cd->name_table;
6099 
6100             for (i = 0; i < cd->names_found; i++)
6101               {
6102               int crc = memcmp(name, slot+IMM2_SIZE, IN_UCHARS(namelen));
6103               if (crc == 0)
6104                 {
6105                 if (slot[IMM2_SIZE+namelen] == 0)
6106                   {
6107                   if (GET2(slot, 0) != cd->bracount + 1 &&
6108                       (options & PCRE_DUPNAMES) == 0)
6109                     {
6110                     *errorcodeptr = ERR43;
6111                     goto FAILED;
6112                     }
6113                   else dupname = TRUE;
6114                   }
6115                 else crc = -1;      /* Current name is a substring */
6116                 }
6117 
6118               /* Make space in the table and break the loop for an earlier
6119               name. For a duplicate or later name, carry on. We do this for
6120               duplicates so that in the simple case (when (?| is not used) they
6121               are in order of their numbers. */
6122 
6123               if (crc < 0)
6124                 {
6125                 memmove(slot + cd->name_entry_size, slot,
6126                   IN_UCHARS((cd->names_found - i) * cd->name_entry_size));
6127                 break;
6128                 }
6129 
6130               /* Continue the loop for a later or duplicate name */
6131 
6132               slot += cd->name_entry_size;
6133               }
6134 
6135             /* For non-duplicate names, check for a duplicate number before
6136             adding the new name. */
6137 
6138             if (!dupname)
6139               {
6140               pcre_uchar *cslot = cd->name_table;
6141               for (i = 0; i < cd->names_found; i++)
6142                 {
6143                 if (cslot != slot)
6144                   {
6145                   if (GET2(cslot, 0) == cd->bracount + 1)
6146                     {
6147                     *errorcodeptr = ERR65;
6148                     goto FAILED;
6149                     }
6150                   }
6151                 else i--;
6152                 cslot += cd->name_entry_size;
6153                 }
6154               }
6155 
6156             PUT2(slot, 0, cd->bracount + 1);
6157             memcpy(slot + IMM2_SIZE, name, IN_UCHARS(namelen));
6158             slot[IMM2_SIZE + namelen] = 0;
6159             }
6160           }
6161 
6162         /* In both pre-compile and compile, count the number of names we've
6163         encountered. */
6164 
6165         cd->names_found++;
6166         ptr++;                    /* Move past > or ' */
6167         goto NUMBERED_GROUP;
6168 
6169 
6170         /* ------------------------------------------------------------ */
6171         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
6172         terminator = CHAR_RIGHT_PARENTHESIS;
6173         is_recurse = TRUE;
6174         /* Fall through */
6175 
6176         /* We come here from the Python syntax above that handles both
6177         references (?P=name) and recursion (?P>name), as well as falling
6178         through from the Perl recursion syntax (?&name). We also come here from
6179         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
6180         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
6181 
6182         NAMED_REF_OR_RECURSE:
6183         name = ++ptr;
6184         while (MAX_255(*ptr) && (cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
6185         namelen = (int)(ptr - name);
6186 
6187         /* In the pre-compile phase, do a syntax check. We used to just set
6188         a dummy reference number, because it was not used in the first pass.
6189         However, with the change of recursive back references to be atomic,
6190         we have to look for the number so that this state can be identified, as
6191         otherwise the incorrect length is computed. If it's not a backwards
6192         reference, the dummy number will do. */
6193 
6194         if (lengthptr != NULL)
6195           {
6196           const pcre_uchar *temp;
6197 
6198           if (namelen == 0)
6199             {
6200             *errorcodeptr = ERR62;
6201             goto FAILED;
6202             }
6203           if (*ptr != terminator)
6204             {
6205             *errorcodeptr = ERR42;
6206             goto FAILED;
6207             }
6208           if (namelen > MAX_NAME_SIZE)
6209             {
6210             *errorcodeptr = ERR48;
6211             goto FAILED;
6212             }
6213 
6214           /* The name table does not exist in the first pass, so we cannot
6215           do a simple search as in the code below. Instead, we have to scan the
6216           pattern to find the number. It is important that we scan it only as
6217           far as we have got because the syntax of named subpatterns has not
6218           been checked for the rest of the pattern, and find_parens() assumes
6219           correct syntax. In any case, it's a waste of resources to scan
6220           further. We stop the scan at the current point by temporarily
6221           adjusting the value of cd->end_pattern. */
6222 
6223           temp = cd->end_pattern;
6224           cd->end_pattern = ptr;
6225           recno = find_parens(cd, name, namelen,
6226             (options & PCRE_EXTENDED) != 0, utf);
6227           cd->end_pattern = temp;
6228           if (recno < 0) recno = 0;    /* Forward ref; set dummy number */
6229           }
6230 
6231         /* In the real compile, seek the name in the table. We check the name
6232         first, and then check that we have reached the end of the name in the
6233         table. That way, if the name is longer than any in the table,
6234         the comparison will fail without reading beyond the table entry. */
6235 
6236         else
6237           {
6238           slot = cd->name_table;
6239           for (i = 0; i < cd->names_found; i++)
6240             {
6241             if (STRNCMP_UC_UC(name, slot+IMM2_SIZE, namelen) == 0 &&
6242                 slot[IMM2_SIZE+namelen] == 0)
6243               break;
6244             slot += cd->name_entry_size;
6245             }
6246 
6247           if (i < cd->names_found)         /* Back reference */
6248             {
6249             recno = GET2(slot, 0);
6250             }
6251           else if ((recno =                /* Forward back reference */
6252                     find_parens(cd, name, namelen,
6253                       (options & PCRE_EXTENDED) != 0, utf)) <= 0)
6254             {
6255             *errorcodeptr = ERR15;
6256             goto FAILED;
6257             }
6258           }
6259 
6260         /* In both phases, we can now go to the code that handles numerical
6261         recursion or backreferences. */
6262 
6263         if (is_recurse) goto HANDLE_RECURSION;
6264           else goto HANDLE_REFERENCE;
6265 
6266 
6267         /* ------------------------------------------------------------ */
6268         case CHAR_R:              /* Recursion */
6269         ptr++;                    /* Same as (?0)      */
6270         /* Fall through */
6271 
6272 
6273         /* ------------------------------------------------------------ */
6274         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
6275         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
6276         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
6277           {
6278           const pcre_uchar *called;
6279           terminator = CHAR_RIGHT_PARENTHESIS;
6280 
6281           /* Come here from the \g<...> and \g'...' code (Oniguruma
6282           compatibility). However, the syntax has been checked to ensure that
6283           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
6284           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
6285           ever be taken. */
6286 
6287           HANDLE_NUMERICAL_RECURSION:
6288 
6289           if ((refsign = *ptr) == CHAR_PLUS)
6290             {
6291             ptr++;
6292             if (!IS_DIGIT(*ptr))
6293               {
6294               *errorcodeptr = ERR63;
6295               goto FAILED;
6296               }
6297             }
6298           else if (refsign == CHAR_MINUS)
6299             {
6300             if (!IS_DIGIT(ptr[1]))
6301               goto OTHER_CHAR_AFTER_QUERY;
6302             ptr++;
6303             }
6304 
6305           recno = 0;
6306           while(IS_DIGIT(*ptr))
6307             recno = recno * 10 + *ptr++ - CHAR_0;
6308 
6309           if (*ptr != terminator)
6310             {
6311             *errorcodeptr = ERR29;
6312             goto FAILED;
6313             }
6314 
6315           if (refsign == CHAR_MINUS)
6316             {
6317             if (recno == 0)
6318               {
6319               *errorcodeptr = ERR58;
6320               goto FAILED;
6321               }
6322             recno = cd->bracount - recno + 1;
6323             if (recno <= 0)
6324               {
6325               *errorcodeptr = ERR15;
6326               goto FAILED;
6327               }
6328             }
6329           else if (refsign == CHAR_PLUS)
6330             {
6331             if (recno == 0)
6332               {
6333               *errorcodeptr = ERR58;
6334               goto FAILED;
6335               }
6336             recno += cd->bracount;
6337             }
6338 
6339           /* Come here from code above that handles a named recursion */
6340 
6341           HANDLE_RECURSION:
6342 
6343           previous = code;
6344           called = cd->start_code;
6345 
6346           /* When we are actually compiling, find the bracket that is being
6347           referenced. Temporarily end the regex in case it doesn't exist before
6348           this point. If we end up with a forward reference, first check that
6349           the bracket does occur later so we can give the error (and position)
6350           now. Then remember this forward reference in the workspace so it can
6351           be filled in at the end. */
6352 
6353           if (lengthptr == NULL)
6354             {
6355             *code = OP_END;
6356             if (recno != 0)
6357               called = PRIV(find_bracket)(cd->start_code, utf, recno);
6358 
6359             /* Forward reference */
6360 
6361             if (called == NULL)
6362               {
6363               if (find_parens(cd, NULL, recno,
6364                     (options & PCRE_EXTENDED) != 0, utf) < 0)
6365                 {
6366                 *errorcodeptr = ERR15;
6367                 goto FAILED;
6368                 }
6369 
6370               /* Fudge the value of "called" so that when it is inserted as an
6371               offset below, what is actually inserted is the reference number
6372               of the group. Then remember the forward reference. */
6373 
6374               called = cd->start_code + recno;
6375               if (cd->hwm >= cd->start_workspace + cd->workspace_size -
6376                   WORK_SIZE_SAFETY_MARGIN)
6377                 {
6378                 *errorcodeptr = expand_workspace(cd);
6379                 if (*errorcodeptr != 0) goto FAILED;
6380                 }
6381               PUTINC(cd->hwm, 0, (int)(code + 1 - cd->start_code));
6382               }
6383 
6384             /* If not a forward reference, and the subpattern is still open,
6385             this is a recursive call. We check to see if this is a left
6386             recursion that could loop for ever, and diagnose that case. We
6387             must not, however, do this check if we are in a conditional
6388             subpattern because the condition might be testing for recursion in
6389             a pattern such as /(?(R)a+|(?R)b)/, which is perfectly valid.
6390             Forever loops are also detected at runtime, so those that occur in
6391             conditional subpatterns will be picked up then. */
6392 
6393             else if (GET(called, 1) == 0 && cond_depth <= 0 &&
6394                      could_be_empty(called, code, bcptr, utf, cd))
6395               {
6396               *errorcodeptr = ERR40;
6397               goto FAILED;
6398               }
6399             }
6400 
6401           /* Insert the recursion/subroutine item. It does not have a set first
6402           character (relevant if it is repeated, because it will then be
6403           wrapped with ONCE brackets). */
6404 
6405           *code = OP_RECURSE;
6406           PUT(code, 1, (int)(called - cd->start_code));
6407           code += 1 + LINK_SIZE;
6408           groupsetfirstchar = FALSE;
6409           }
6410 
6411         /* Can't determine a first byte now */
6412 
6413         if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
6414         continue;
6415 
6416 
6417         /* ------------------------------------------------------------ */
6418         default:              /* Other characters: check option setting */
6419         OTHER_CHAR_AFTER_QUERY:
6420         set = unset = 0;
6421         optset = &set;
6422 
6423         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
6424           {
6425           switch (*ptr++)
6426             {
6427             case CHAR_MINUS: optset = &unset; break;
6428 
6429             case CHAR_J:    /* Record that it changed in the external options */
6430             *optset |= PCRE_DUPNAMES;
6431             cd->external_flags |= PCRE_JCHANGED;
6432             break;
6433 
6434             case CHAR_i: *optset |= PCRE_CASELESS; break;
6435             case CHAR_m: *optset |= PCRE_MULTILINE; break;
6436             case CHAR_s: *optset |= PCRE_DOTALL; break;
6437             case CHAR_x: *optset |= PCRE_EXTENDED; break;
6438             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
6439             case CHAR_X: *optset |= PCRE_EXTRA; break;
6440 
6441             default:  *errorcodeptr = ERR12;
6442                       ptr--;    /* Correct the offset */
6443                       goto FAILED;
6444             }
6445           }
6446 
6447         /* Set up the changed option bits, but don't change anything yet. */
6448 
6449         newoptions = (options | set) & (~unset);
6450 
6451         /* If the options ended with ')' this is not the start of a nested
6452         group with option changes, so the options change at this level. If this
6453         item is right at the start of the pattern, the options can be
6454         abstracted and made external in the pre-compile phase, and ignored in
6455         the compile phase. This can be helpful when matching -- for instance in
6456         caseless checking of required bytes.
6457 
6458         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
6459         definitely *not* at the start of the pattern because something has been
6460         compiled. In the pre-compile phase, however, the code pointer can have
6461         that value after the start, because it gets reset as code is discarded
6462         during the pre-compile. However, this can happen only at top level - if
6463         we are within parentheses, the starting BRA will still be present. At
6464         any parenthesis level, the length value can be used to test if anything
6465         has been compiled at that level. Thus, a test for both these conditions
6466         is necessary to ensure we correctly detect the start of the pattern in
6467         both phases.
6468 
6469         If we are not at the pattern start, reset the greedy defaults and the
6470         case value for firstchar and reqchar. */
6471 
6472         if (*ptr == CHAR_RIGHT_PARENTHESIS)
6473           {
6474           if (code == cd->start_code + 1 + LINK_SIZE &&
6475                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
6476             {
6477             cd->external_options = newoptions;
6478             }
6479           else
6480             {
6481             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
6482             greedy_non_default = greedy_default ^ 1;
6483             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS:0;
6484             }
6485 
6486           /* Change options at this level, and pass them back for use
6487           in subsequent branches. */
6488 
6489           *optionsptr = options = newoptions;
6490           previous = NULL;       /* This item can't be repeated */
6491           continue;              /* It is complete */
6492           }
6493 
6494         /* If the options ended with ':' we are heading into a nested group
6495         with possible change of options. Such groups are non-capturing and are
6496         not assertions of any kind. All we need to do is skip over the ':';
6497         the newoptions value is handled below. */
6498 
6499         bravalue = OP_BRA;
6500         ptr++;
6501         }     /* End of switch for character following (? */
6502       }       /* End of (? handling */
6503 
6504     /* Opening parenthesis not followed by '*' or '?'. If PCRE_NO_AUTO_CAPTURE
6505     is set, all unadorned brackets become non-capturing and behave like (?:...)
6506     brackets. */
6507 
6508     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
6509       {
6510       bravalue = OP_BRA;
6511       }
6512 
6513     /* Else we have a capturing group. */
6514 
6515     else
6516       {
6517       NUMBERED_GROUP:
6518       cd->bracount += 1;
6519       PUT2(code, 1+LINK_SIZE, cd->bracount);
6520       skipbytes = IMM2_SIZE;
6521       }
6522 
6523     /* Process nested bracketed regex. Assertions used not to be repeatable,
6524     but this was changed for Perl compatibility, so all kinds can now be
6525     repeated. We copy code into a non-register variable (tempcode) in order to
6526     be able to pass its address because some compilers complain otherwise. */
6527 
6528     previous = code;                      /* For handling repetition */
6529     *code = bravalue;
6530     tempcode = code;
6531     tempreqvary = cd->req_varyopt;        /* Save value before bracket */
6532     tempbracount = cd->bracount;          /* Save value before bracket */
6533     length_prevgroup = 0;                 /* Initialize for pre-compile phase */
6534 
6535     if (!compile_regex(
6536          newoptions,                      /* The complete new option state */
6537          &tempcode,                       /* Where to put code (updated) */
6538          &ptr,                            /* Input pointer (updated) */
6539          errorcodeptr,                    /* Where to put an error message */
6540          (bravalue == OP_ASSERTBACK ||
6541           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
6542          reset_bracount,                  /* True if (?| group */
6543          skipbytes,                       /* Skip over bracket number */
6544          cond_depth +
6545            ((bravalue == OP_COND)?1:0),   /* Depth of condition subpatterns */
6546          &subfirstchar,                   /* For possible first char */
6547          &subreqchar,                     /* For possible last char */
6548          bcptr,                           /* Current branch chain */
6549          cd,                              /* Tables block */
6550          (lengthptr == NULL)? NULL :      /* Actual compile phase */
6551            &length_prevgroup              /* Pre-compile phase */
6552          ))
6553       goto FAILED;
6554 
6555     /* If this was an atomic group and there are no capturing groups within it,
6556     generate OP_ONCE_NC instead of OP_ONCE. */
6557 
6558     if (bravalue == OP_ONCE && cd->bracount <= tempbracount)
6559       *code = OP_ONCE_NC;
6560 
6561     if (bravalue >= OP_ASSERT && bravalue <= OP_ASSERTBACK_NOT)
6562       cd->assert_depth -= 1;
6563 
6564     /* At the end of compiling, code is still pointing to the start of the
6565     group, while tempcode has been updated to point past the end of the group.
6566     The pattern pointer (ptr) is on the bracket.
6567 
6568     If this is a conditional bracket, check that there are no more than
6569     two branches in the group, or just one if it's a DEFINE group. We do this
6570     in the real compile phase, not in the pre-pass, where the whole group may
6571     not be available. */
6572 
6573     if (bravalue == OP_COND && lengthptr == NULL)
6574       {
6575       pcre_uchar *tc = code;
6576       int condcount = 0;
6577 
6578       do {
6579          condcount++;
6580          tc += GET(tc,1);
6581          }
6582       while (*tc != OP_KET);
6583 
6584       /* A DEFINE group is never obeyed inline (the "condition" is always
6585       false). It must have only one branch. */
6586 
6587       if (code[LINK_SIZE+1] == OP_DEF)
6588         {
6589         if (condcount > 1)
6590           {
6591           *errorcodeptr = ERR54;
6592           goto FAILED;
6593           }
6594         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
6595         }
6596 
6597       /* A "normal" conditional group. If there is just one branch, we must not
6598       make use of its firstchar or reqchar, because this is equivalent to an
6599       empty second branch. */
6600 
6601       else
6602         {
6603         if (condcount > 2)
6604           {
6605           *errorcodeptr = ERR27;
6606           goto FAILED;
6607           }
6608         if (condcount == 1) subfirstchar = subreqchar = REQ_NONE;
6609         }
6610       }
6611 
6612     /* Error if hit end of pattern */
6613 
6614     if (*ptr != CHAR_RIGHT_PARENTHESIS)
6615       {
6616       *errorcodeptr = ERR14;
6617       goto FAILED;
6618       }
6619 
6620     /* In the pre-compile phase, update the length by the length of the group,
6621     less the brackets at either end. Then reduce the compiled code to just a
6622     set of non-capturing brackets so that it doesn't use much memory if it is
6623     duplicated by a quantifier.*/
6624 
6625     if (lengthptr != NULL)
6626       {
6627       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
6628         {
6629         *errorcodeptr = ERR20;
6630         goto FAILED;
6631         }
6632       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
6633       code++;   /* This already contains bravalue */
6634       PUTINC(code, 0, 1 + LINK_SIZE);
6635       *code++ = OP_KET;
6636       PUTINC(code, 0, 1 + LINK_SIZE);
6637       break;    /* No need to waste time with special character handling */
6638       }
6639 
6640     /* Otherwise update the main code pointer to the end of the group. */
6641 
6642     code = tempcode;
6643 
6644     /* For a DEFINE group, required and first character settings are not
6645     relevant. */
6646 
6647     if (bravalue == OP_DEF) break;
6648 
6649     /* Handle updating of the required and first characters for other types of
6650     group. Update for normal brackets of all kinds, and conditions with two
6651     branches (see code above). If the bracket is followed by a quantifier with
6652     zero repeat, we have to back off. Hence the definition of zeroreqchar and
6653     zerofirstchar outside the main loop so that they can be accessed for the
6654     back off. */
6655 
6656     zeroreqchar = reqchar;
6657     zerofirstchar = firstchar;
6658     groupsetfirstchar = FALSE;
6659 
6660     if (bravalue >= OP_ONCE)
6661       {
6662       /* If we have not yet set a firstchar in this branch, take it from the
6663       subpattern, remembering that it was set here so that a repeat of more
6664       than one can replicate it as reqchar if necessary. If the subpattern has
6665       no firstchar, set "none" for the whole branch. In both cases, a zero
6666       repeat forces firstchar to "none". */
6667 
6668       if (firstchar == REQ_UNSET)
6669         {
6670         if (subfirstchar >= 0)
6671           {
6672           firstchar = subfirstchar;
6673           groupsetfirstchar = TRUE;
6674           }
6675         else firstchar = REQ_NONE;
6676         zerofirstchar = REQ_NONE;
6677         }
6678 
6679       /* If firstchar was previously set, convert the subpattern's firstchar
6680       into reqchar if there wasn't one, using the vary flag that was in
6681       existence beforehand. */
6682 
6683       else if (subfirstchar >= 0 && subreqchar < 0)
6684         subreqchar = subfirstchar | tempreqvary;
6685 
6686       /* If the subpattern set a required byte (or set a first byte that isn't
6687       really the first byte - see above), set it. */
6688 
6689       if (subreqchar >= 0) reqchar = subreqchar;
6690       }
6691 
6692     /* For a forward assertion, we take the reqchar, if set. This can be
6693     helpful if the pattern that follows the assertion doesn't set a different
6694     char. For example, it's useful for /(?=abcde).+/. We can't set firstchar
6695     for an assertion, however because it leads to incorrect effect for patterns
6696     such as /(?=a)a.+/ when the "real" "a" would then become a reqchar instead
6697     of a firstchar. This is overcome by a scan at the end if there's no
6698     firstchar, looking for an asserted first char. */
6699 
6700     else if (bravalue == OP_ASSERT && subreqchar >= 0) reqchar = subreqchar;
6701     break;     /* End of processing '(' */
6702 
6703 
6704     /* ===================================================================*/
6705     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
6706     are arranged to be the negation of the corresponding OP_values in the
6707     default case when PCRE_UCP is not set. For the back references, the values
6708     are ESC_REF plus the reference number. Only back references and those types
6709     that consume a character may be repeated. We can test for values between
6710     ESC_b and ESC_Z for the latter; this may have to change if any new ones are
6711     ever created. */
6712 
6713     case CHAR_BACKSLASH:
6714     tempptr = ptr;
6715     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
6716     if (*errorcodeptr != 0) goto FAILED;
6717 
6718     if (c < 0)
6719       {
6720       if (-c == ESC_Q)            /* Handle start of quoted string */
6721         {
6722         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
6723           ptr += 2;               /* avoid empty string */
6724             else inescq = TRUE;
6725         continue;
6726         }
6727 
6728       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
6729 
6730       /* For metasequences that actually match a character, we disable the
6731       setting of a first character if it hasn't already been set. */
6732 
6733       if (firstchar == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
6734         firstchar = REQ_NONE;
6735 
6736       /* Set values to reset to if this is followed by a zero repeat. */
6737 
6738       zerofirstchar = firstchar;
6739       zeroreqchar = reqchar;
6740 
6741       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
6742       is a subroutine call by number (Oniguruma syntax). In fact, the value
6743       -ESC_g is returned only for these cases. So we don't need to check for <
6744       or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
6745       -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
6746       that is a synonym for a named back reference). */
6747 
6748       if (-c == ESC_g)
6749         {
6750         const pcre_uchar *p;
6751         save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
6752         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6753           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
6754 
6755         /* These two statements stop the compiler for warning about possibly
6756         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
6757         fact, because we actually check for a number below, the paths that
6758         would actually be in error are never taken. */
6759 
6760         skipbytes = 0;
6761         reset_bracount = FALSE;
6762 
6763         /* Test for a name */
6764 
6765         if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
6766           {
6767           BOOL is_a_number = TRUE;
6768           for (p = ptr + 1; *p != 0 && *p != terminator; p++)
6769             {
6770             if (!MAX_255(*p)) { is_a_number = FALSE; break; }
6771             if ((cd->ctypes[*p] & ctype_digit) == 0) is_a_number = FALSE;
6772             if ((cd->ctypes[*p] & ctype_word) == 0) break;
6773             }
6774           if (*p != terminator)
6775             {
6776             *errorcodeptr = ERR57;
6777             break;
6778             }
6779           if (is_a_number)
6780             {
6781             ptr++;
6782             goto HANDLE_NUMERICAL_RECURSION;
6783             }
6784           is_recurse = TRUE;
6785           goto NAMED_REF_OR_RECURSE;
6786           }
6787 
6788         /* Test a signed number in angle brackets or quotes. */
6789 
6790         p = ptr + 2;
6791         while (IS_DIGIT(*p)) p++;
6792         if (*p != terminator)
6793           {
6794           *errorcodeptr = ERR57;
6795           break;
6796           }
6797         ptr++;
6798         goto HANDLE_NUMERICAL_RECURSION;
6799         }
6800 
6801       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
6802       We also support \k{name} (.NET syntax).  */
6803 
6804       if (-c == ESC_k)
6805         {
6806         if ((ptr[1] != CHAR_LESS_THAN_SIGN &&
6807           ptr[1] != CHAR_APOSTROPHE && ptr[1] != CHAR_LEFT_CURLY_BRACKET))
6808           {
6809           *errorcodeptr = ERR69;
6810           break;
6811           }
6812         is_recurse = FALSE;
6813         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
6814           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
6815           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
6816         goto NAMED_REF_OR_RECURSE;
6817         }
6818 
6819       /* Back references are handled specially; must disable firstchar if
6820       not set to cope with cases like (?=(\w+))\1: which would otherwise set
6821       ':' later. */
6822 
6823       if (-c >= ESC_REF)
6824         {
6825         open_capitem *oc;
6826         recno = -c - ESC_REF;
6827 
6828         HANDLE_REFERENCE:    /* Come here from named backref handling */
6829         if (firstchar == REQ_UNSET) firstchar = REQ_NONE;
6830         previous = code;
6831         *code++ = ((options & PCRE_CASELESS) != 0)? OP_REFI : OP_REF;
6832         PUT2INC(code, 0, recno);
6833         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
6834         if (recno > cd->top_backref) cd->top_backref = recno;
6835 
6836         /* Check to see if this back reference is recursive, that it, it
6837         is inside the group that it references. A flag is set so that the
6838         group can be made atomic. */
6839 
6840         for (oc = cd->open_caps; oc != NULL; oc = oc->next)
6841           {
6842           if (oc->number == recno)
6843             {
6844             oc->flag = TRUE;
6845             break;
6846             }
6847           }
6848         }
6849 
6850       /* So are Unicode property matches, if supported. */
6851 
6852 #ifdef SUPPORT_UCP
6853       else if (-c == ESC_P || -c == ESC_p)
6854         {
6855         BOOL negated;
6856         int pdata;
6857         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
6858         if (ptype < 0) goto FAILED;
6859         previous = code;
6860         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
6861         *code++ = ptype;
6862         *code++ = pdata;
6863         }
6864 #else
6865 
6866       /* If Unicode properties are not supported, \X, \P, and \p are not
6867       allowed. */
6868 
6869       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
6870         {
6871         *errorcodeptr = ERR45;
6872         goto FAILED;
6873         }
6874 #endif
6875 
6876       /* For the rest (including \X when Unicode properties are supported), we
6877       can obtain the OP value by negating the escape value in the default
6878       situation when PCRE_UCP is not set. When it *is* set, we substitute
6879       Unicode property tests. Note that \b and \B do a one-character
6880       lookbehind. */
6881 
6882       else
6883         {
6884         if ((-c == ESC_b || -c == ESC_B) && cd->max_lookbehind == 0)
6885           cd->max_lookbehind = 1;
6886 #ifdef SUPPORT_UCP
6887         if (-c >= ESC_DU && -c <= ESC_wu)
6888           {
6889           nestptr = ptr + 1;                   /* Where to resume */
6890           ptr = substitutes[-c - ESC_DU] - 1;  /* Just before substitute */
6891           }
6892         else
6893 #endif
6894         /* In non-UTF-8 mode, we turn \C into OP_ALLANY instead of OP_ANYBYTE
6895         so that it works in DFA mode and in lookbehinds. */
6896 
6897           {
6898           previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
6899           *code++ = (!utf && c == -ESC_C)? OP_ALLANY : -c;
6900           }
6901         }
6902       continue;
6903       }
6904 
6905     /* We have a data character whose value is in c. In UTF-8 mode it may have
6906     a value > 127. We set its representation in the length/buffer, and then
6907     handle it as a data character. */
6908 
6909 #ifdef SUPPORT_UTF
6910     if (utf && c > MAX_VALUE_FOR_SINGLE_CHAR)
6911       mclength = PRIV(ord2utf)(c, mcbuffer);
6912     else
6913 #endif
6914 
6915      {
6916      mcbuffer[0] = c;
6917      mclength = 1;
6918      }
6919     goto ONE_CHAR;
6920 
6921 
6922     /* ===================================================================*/
6923     /* Handle a literal character. It is guaranteed not to be whitespace or #
6924     when the extended flag is set. If we are in UTF-8 mode, it may be a
6925     multi-byte literal character. */
6926 
6927     default:
6928     NORMAL_CHAR:
6929     mclength = 1;
6930     mcbuffer[0] = c;
6931 
6932 #ifdef SUPPORT_UTF
6933     if (utf && HAS_EXTRALEN(c))
6934       ACROSSCHAR(TRUE, ptr[1], mcbuffer[mclength++] = *(++ptr));
6935 #endif
6936 
6937     /* At this point we have the character's bytes in mcbuffer, and the length
6938     in mclength. When not in UTF-8 mode, the length is always 1. */
6939 
6940     ONE_CHAR:
6941     previous = code;
6942     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARI : OP_CHAR;
6943     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
6944 
6945     /* Remember if \r or \n were seen */
6946 
6947     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
6948       cd->external_flags |= PCRE_HASCRORLF;
6949 
6950     /* Set the first and required bytes appropriately. If no previous first
6951     byte, set it from this character, but revert to none on a zero repeat.
6952     Otherwise, leave the firstchar value alone, and don't change it on a zero
6953     repeat. */
6954 
6955     if (firstchar == REQ_UNSET)
6956       {
6957       zerofirstchar = REQ_NONE;
6958       zeroreqchar = reqchar;
6959 
6960       /* If the character is more than one byte long, we can set firstchar
6961       only if it is not to be matched caselessly. */
6962 
6963       if (mclength == 1 || req_caseopt == 0)
6964         {
6965         firstchar = mcbuffer[0] | req_caseopt;
6966         if (mclength != 1) reqchar = code[-1] | cd->req_varyopt;
6967         }
6968       else firstchar = reqchar = REQ_NONE;
6969       }
6970 
6971     /* firstchar was previously set; we can set reqchar only if the length is
6972     1 or the matching is caseful. */
6973 
6974     else
6975       {
6976       zerofirstchar = firstchar;
6977       zeroreqchar = reqchar;
6978       if (mclength == 1 || req_caseopt == 0)
6979         reqchar = code[-1] | req_caseopt | cd->req_varyopt;
6980       }
6981 
6982     break;            /* End of literal character handling */
6983     }
6984   }                   /* end of big loop */
6985 
6986 
6987 /* Control never reaches here by falling through, only by a goto for all the
6988 error states. Pass back the position in the pattern so that it can be displayed
6989 to the user for diagnosing the error. */
6990 
6991 FAILED:
6992 *ptrptr = ptr;
6993 return FALSE;
6994 }
6995 
6996 
6997 
6998 
6999 /*************************************************
7000 *     Compile sequence of alternatives           *
7001 *************************************************/
7002 
7003 /* On entry, ptr is pointing past the bracket character, but on return it
7004 points to the closing bracket, or vertical bar, or end of string. The code
7005 variable is pointing at the byte into which the BRA operator has been stored.
7006 This function is used during the pre-compile phase when we are trying to find
7007 out the amount of memory needed, as well as during the real compile phase. The
7008 value of lengthptr distinguishes the two phases.
7009 
7010 Arguments:
7011   options        option bits, including any changes for this subpattern
7012   codeptr        -> the address of the current code pointer
7013   ptrptr         -> the address of the current pattern pointer
7014   errorcodeptr   -> pointer to error code variable
7015   lookbehind     TRUE if this is a lookbehind assertion
7016   reset_bracount TRUE to reset the count for each branch
7017   skipbytes      skip this many bytes at start (for brackets and OP_COND)
7018   cond_depth     depth of nesting for conditional subpatterns
7019   firstcharptr   place to put the first required character, or a negative number
7020   reqcharptr     place to put the last required character, or a negative number
7021   bcptr          pointer to the chain of currently open branches
7022   cd             points to the data block with tables pointers etc.
7023   lengthptr      NULL during the real compile phase
7024                  points to length accumulator during pre-compile phase
7025 
7026 Returns:         TRUE on success
7027 */
7028 
7029 static BOOL
compile_regex(int options,pcre_uchar ** codeptr,const pcre_uchar ** ptrptr,int * errorcodeptr,BOOL lookbehind,BOOL reset_bracount,int skipbytes,int cond_depth,pcre_int32 * firstcharptr,pcre_int32 * reqcharptr,branch_chain * bcptr,compile_data * cd,int * lengthptr)7030 compile_regex(int options, pcre_uchar **codeptr, const pcre_uchar **ptrptr,
7031   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
7032   int cond_depth, pcre_int32 *firstcharptr, pcre_int32 *reqcharptr,
7033   branch_chain *bcptr, compile_data *cd, int *lengthptr)
7034 {
7035 const pcre_uchar *ptr = *ptrptr;
7036 pcre_uchar *code = *codeptr;
7037 pcre_uchar *last_branch = code;
7038 pcre_uchar *start_bracket = code;
7039 pcre_uchar *reverse_count = NULL;
7040 open_capitem capitem;
7041 int capnumber = 0;
7042 pcre_int32 firstchar, reqchar;
7043 pcre_int32 branchfirstchar, branchreqchar;
7044 int length;
7045 int orig_bracount;
7046 int max_bracount;
7047 branch_chain bc;
7048 
7049 bc.outer = bcptr;
7050 bc.current_branch = code;
7051 
7052 firstchar = reqchar = REQ_UNSET;
7053 
7054 /* Accumulate the length for use in the pre-compile phase. Start with the
7055 length of the BRA and KET and any extra bytes that are required at the
7056 beginning. We accumulate in a local variable to save frequent testing of
7057 lenthptr for NULL. We cannot do this by looking at the value of code at the
7058 start and end of each alternative, because compiled items are discarded during
7059 the pre-compile phase so that the work space is not exceeded. */
7060 
7061 length = 2 + 2*LINK_SIZE + skipbytes;
7062 
7063 /* WARNING: If the above line is changed for any reason, you must also change
7064 the code that abstracts option settings at the start of the pattern and makes
7065 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
7066 pre-compile phase to find out whether anything has yet been compiled or not. */
7067 
7068 /* If this is a capturing subpattern, add to the chain of open capturing items
7069 so that we can detect them if (*ACCEPT) is encountered. This is also used to
7070 detect groups that contain recursive back references to themselves. Note that
7071 only OP_CBRA need be tested here; changing this opcode to one of its variants,
7072 e.g. OP_SCBRAPOS, happens later, after the group has been compiled. */
7073 
7074 if (*code == OP_CBRA)
7075   {
7076   capnumber = GET2(code, 1 + LINK_SIZE);
7077   capitem.number = capnumber;
7078   capitem.next = cd->open_caps;
7079   capitem.flag = FALSE;
7080   cd->open_caps = &capitem;
7081   }
7082 
7083 /* Offset is set zero to mark that this bracket is still open */
7084 
7085 PUT(code, 1, 0);
7086 code += 1 + LINK_SIZE + skipbytes;
7087 
7088 /* Loop for each alternative branch */
7089 
7090 orig_bracount = max_bracount = cd->bracount;
7091 for (;;)
7092   {
7093   /* For a (?| group, reset the capturing bracket count so that each branch
7094   uses the same numbers. */
7095 
7096   if (reset_bracount) cd->bracount = orig_bracount;
7097 
7098   /* Set up dummy OP_REVERSE if lookbehind assertion */
7099 
7100   if (lookbehind)
7101     {
7102     *code++ = OP_REVERSE;
7103     reverse_count = code;
7104     PUTINC(code, 0, 0);
7105     length += 1 + LINK_SIZE;
7106     }
7107 
7108   /* Now compile the branch; in the pre-compile phase its length gets added
7109   into the length. */
7110 
7111   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstchar,
7112         &branchreqchar, &bc, cond_depth, cd,
7113         (lengthptr == NULL)? NULL : &length))
7114     {
7115     *ptrptr = ptr;
7116     return FALSE;
7117     }
7118 
7119   /* Keep the highest bracket count in case (?| was used and some branch
7120   has fewer than the rest. */
7121 
7122   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
7123 
7124   /* In the real compile phase, there is some post-processing to be done. */
7125 
7126   if (lengthptr == NULL)
7127     {
7128     /* If this is the first branch, the firstchar and reqchar values for the
7129     branch become the values for the regex. */
7130 
7131     if (*last_branch != OP_ALT)
7132       {
7133       firstchar = branchfirstchar;
7134       reqchar = branchreqchar;
7135       }
7136 
7137     /* If this is not the first branch, the first char and reqchar have to
7138     match the values from all the previous branches, except that if the
7139     previous value for reqchar didn't have REQ_VARY set, it can still match,
7140     and we set REQ_VARY for the regex. */
7141 
7142     else
7143       {
7144       /* If we previously had a firstchar, but it doesn't match the new branch,
7145       we have to abandon the firstchar for the regex, but if there was
7146       previously no reqchar, it takes on the value of the old firstchar. */
7147 
7148       if (firstchar >= 0 && firstchar != branchfirstchar)
7149         {
7150         if (reqchar < 0) reqchar = firstchar;
7151         firstchar = REQ_NONE;
7152         }
7153 
7154       /* If we (now or from before) have no firstchar, a firstchar from the
7155       branch becomes a reqchar if there isn't a branch reqchar. */
7156 
7157       if (firstchar < 0 && branchfirstchar >= 0 && branchreqchar < 0)
7158           branchreqchar = branchfirstchar;
7159 
7160       /* Now ensure that the reqchars match */
7161 
7162       if ((reqchar & ~REQ_VARY) != (branchreqchar & ~REQ_VARY))
7163         reqchar = REQ_NONE;
7164       else reqchar |= branchreqchar;   /* To "or" REQ_VARY */
7165       }
7166 
7167     /* If lookbehind, check that this branch matches a fixed-length string, and
7168     put the length into the OP_REVERSE item. Temporarily mark the end of the
7169     branch with OP_END. If the branch contains OP_RECURSE, the result is -3
7170     because there may be forward references that we can't check here. Set a
7171     flag to cause another lookbehind check at the end. Why not do it all at the
7172     end? Because common, erroneous checks are picked up here and the offset of
7173     the problem can be shown. */
7174 
7175     if (lookbehind)
7176       {
7177       int fixed_length;
7178       *code = OP_END;
7179       fixed_length = find_fixedlength(last_branch,  (options & PCRE_UTF8) != 0,
7180         FALSE, cd);
7181       DPRINTF(("fixed length = %d\n", fixed_length));
7182       if (fixed_length == -3)
7183         {
7184         cd->check_lookbehind = TRUE;
7185         }
7186       else if (fixed_length < 0)
7187         {
7188         *errorcodeptr = (fixed_length == -2)? ERR36 :
7189                         (fixed_length == -4)? ERR70: ERR25;
7190         *ptrptr = ptr;
7191         return FALSE;
7192         }
7193       else
7194         {
7195         if (fixed_length > cd->max_lookbehind)
7196           cd->max_lookbehind = fixed_length;
7197         PUT(reverse_count, 0, fixed_length);
7198         }
7199       }
7200     }
7201 
7202   /* Reached end of expression, either ')' or end of pattern. In the real
7203   compile phase, go back through the alternative branches and reverse the chain
7204   of offsets, with the field in the BRA item now becoming an offset to the
7205   first alternative. If there are no alternatives, it points to the end of the
7206   group. The length in the terminating ket is always the length of the whole
7207   bracketed item. Return leaving the pointer at the terminating char. */
7208 
7209   if (*ptr != CHAR_VERTICAL_LINE)
7210     {
7211     if (lengthptr == NULL)
7212       {
7213       int branch_length = (int)(code - last_branch);
7214       do
7215         {
7216         int prev_length = GET(last_branch, 1);
7217         PUT(last_branch, 1, branch_length);
7218         branch_length = prev_length;
7219         last_branch -= branch_length;
7220         }
7221       while (branch_length > 0);
7222       }
7223 
7224     /* Fill in the ket */
7225 
7226     *code = OP_KET;
7227     PUT(code, 1, (int)(code - start_bracket));
7228     code += 1 + LINK_SIZE;
7229 
7230     /* If it was a capturing subpattern, check to see if it contained any
7231     recursive back references. If so, we must wrap it in atomic brackets.
7232     In any event, remove the block from the chain. */
7233 
7234     if (capnumber > 0)
7235       {
7236       if (cd->open_caps->flag)
7237         {
7238         memmove(start_bracket + 1 + LINK_SIZE, start_bracket,
7239           IN_UCHARS(code - start_bracket));
7240         *start_bracket = OP_ONCE;
7241         code += 1 + LINK_SIZE;
7242         PUT(start_bracket, 1, (int)(code - start_bracket));
7243         *code = OP_KET;
7244         PUT(code, 1, (int)(code - start_bracket));
7245         code += 1 + LINK_SIZE;
7246         length += 2 + 2*LINK_SIZE;
7247         }
7248       cd->open_caps = cd->open_caps->next;
7249       }
7250 
7251     /* Retain the highest bracket number, in case resetting was used. */
7252 
7253     cd->bracount = max_bracount;
7254 
7255     /* Set values to pass back */
7256 
7257     *codeptr = code;
7258     *ptrptr = ptr;
7259     *firstcharptr = firstchar;
7260     *reqcharptr = reqchar;
7261     if (lengthptr != NULL)
7262       {
7263       if (OFLOW_MAX - *lengthptr < length)
7264         {
7265         *errorcodeptr = ERR20;
7266         return FALSE;
7267         }
7268       *lengthptr += length;
7269       }
7270     return TRUE;
7271     }
7272 
7273   /* Another branch follows. In the pre-compile phase, we can move the code
7274   pointer back to where it was for the start of the first branch. (That is,
7275   pretend that each branch is the only one.)
7276 
7277   In the real compile phase, insert an ALT node. Its length field points back
7278   to the previous branch while the bracket remains open. At the end the chain
7279   is reversed. It's done like this so that the start of the bracket has a
7280   zero offset until it is closed, making it possible to detect recursion. */
7281 
7282   if (lengthptr != NULL)
7283     {
7284     code = *codeptr + 1 + LINK_SIZE + skipbytes;
7285     length += 1 + LINK_SIZE;
7286     }
7287   else
7288     {
7289     *code = OP_ALT;
7290     PUT(code, 1, (int)(code - last_branch));
7291     bc.current_branch = last_branch = code;
7292     code += 1 + LINK_SIZE;
7293     }
7294 
7295   ptr++;
7296   }
7297 /* Control never reaches here */
7298 }
7299 
7300 
7301 
7302 
7303 /*************************************************
7304 *          Check for anchored expression         *
7305 *************************************************/
7306 
7307 /* Try to find out if this is an anchored regular expression. Consider each
7308 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
7309 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
7310 it's anchored. However, if this is a multiline pattern, then only OP_SOD will
7311 be found, because ^ generates OP_CIRCM in that mode.
7312 
7313 We can also consider a regex to be anchored if OP_SOM starts all its branches.
7314 This is the code for \G, which means "match at start of match position, taking
7315 into account the match offset".
7316 
7317 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
7318 because that will try the rest of the pattern at all possible matching points,
7319 so there is no point trying again.... er ....
7320 
7321 .... except when the .* appears inside capturing parentheses, and there is a
7322 subsequent back reference to those parentheses. We haven't enough information
7323 to catch that case precisely.
7324 
7325 At first, the best we could do was to detect when .* was in capturing brackets
7326 and the highest back reference was greater than or equal to that level.
7327 However, by keeping a bitmap of the first 31 back references, we can catch some
7328 of the more common cases more precisely.
7329 
7330 Arguments:
7331   code           points to start of expression (the bracket)
7332   bracket_map    a bitmap of which brackets we are inside while testing; this
7333                   handles up to substring 31; after that we just have to take
7334                   the less precise approach
7335   backref_map    the back reference bitmap
7336 
7337 Returns:     TRUE or FALSE
7338 */
7339 
7340 static BOOL
is_anchored(const pcre_uchar * code,unsigned int bracket_map,unsigned int backref_map)7341 is_anchored(const pcre_uchar *code, unsigned int bracket_map,
7342   unsigned int backref_map)
7343 {
7344 do {
7345    const pcre_uchar *scode = first_significant_code(
7346      code + PRIV(OP_lengths)[*code], FALSE);
7347    int op = *scode;
7348 
7349    /* Non-capturing brackets */
7350 
7351    if (op == OP_BRA  || op == OP_BRAPOS ||
7352        op == OP_SBRA || op == OP_SBRAPOS)
7353      {
7354      if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
7355      }
7356 
7357    /* Capturing brackets */
7358 
7359    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
7360             op == OP_SCBRA || op == OP_SCBRAPOS)
7361      {
7362      int n = GET2(scode, 1+LINK_SIZE);
7363      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
7364      if (!is_anchored(scode, new_map, backref_map)) return FALSE;
7365      }
7366 
7367    /* Other brackets */
7368 
7369    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC ||
7370             op == OP_COND)
7371      {
7372      if (!is_anchored(scode, bracket_map, backref_map)) return FALSE;
7373      }
7374 
7375    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
7376    it isn't in brackets that are or may be referenced. */
7377 
7378    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
7379              op == OP_TYPEPOSSTAR))
7380      {
7381      if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
7382        return FALSE;
7383      }
7384 
7385    /* Check for explicit anchoring */
7386 
7387    else if (op != OP_SOD && op != OP_SOM && op != OP_CIRC) return FALSE;
7388    code += GET(code, 1);
7389    }
7390 while (*code == OP_ALT);   /* Loop for each alternative */
7391 return TRUE;
7392 }
7393 
7394 
7395 
7396 /*************************************************
7397 *         Check for starting with ^ or .*        *
7398 *************************************************/
7399 
7400 /* This is called to find out if every branch starts with ^ or .* so that
7401 "first char" processing can be done to speed things up in multiline
7402 matching and for non-DOTALL patterns that start with .* (which must start at
7403 the beginning or after \n). As in the case of is_anchored() (see above), we
7404 have to take account of back references to capturing brackets that contain .*
7405 because in that case we can't make the assumption.
7406 
7407 Arguments:
7408   code           points to start of expression (the bracket)
7409   bracket_map    a bitmap of which brackets we are inside while testing; this
7410                   handles up to substring 31; after that we just have to take
7411                   the less precise approach
7412   backref_map    the back reference bitmap
7413 
7414 Returns:         TRUE or FALSE
7415 */
7416 
7417 static BOOL
is_startline(const pcre_uchar * code,unsigned int bracket_map,unsigned int backref_map)7418 is_startline(const pcre_uchar *code, unsigned int bracket_map,
7419   unsigned int backref_map)
7420 {
7421 do {
7422    const pcre_uchar *scode = first_significant_code(
7423      code + PRIV(OP_lengths)[*code], FALSE);
7424    int op = *scode;
7425 
7426    /* If we are at the start of a conditional assertion group, *both* the
7427    conditional assertion *and* what follows the condition must satisfy the test
7428    for start of line. Other kinds of condition fail. Note that there may be an
7429    auto-callout at the start of a condition. */
7430 
7431    if (op == OP_COND)
7432      {
7433      scode += 1 + LINK_SIZE;
7434      if (*scode == OP_CALLOUT) scode += PRIV(OP_lengths)[OP_CALLOUT];
7435      switch (*scode)
7436        {
7437        case OP_CREF:
7438        case OP_NCREF:
7439        case OP_RREF:
7440        case OP_NRREF:
7441        case OP_DEF:
7442        return FALSE;
7443 
7444        default:     /* Assertion */
7445        if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
7446        do scode += GET(scode, 1); while (*scode == OP_ALT);
7447        scode += 1 + LINK_SIZE;
7448        break;
7449        }
7450      scode = first_significant_code(scode, FALSE);
7451      op = *scode;
7452      }
7453 
7454    /* Non-capturing brackets */
7455 
7456    if (op == OP_BRA  || op == OP_BRAPOS ||
7457        op == OP_SBRA || op == OP_SBRAPOS)
7458      {
7459      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
7460      }
7461 
7462    /* Capturing brackets */
7463 
7464    else if (op == OP_CBRA  || op == OP_CBRAPOS ||
7465             op == OP_SCBRA || op == OP_SCBRAPOS)
7466      {
7467      int n = GET2(scode, 1+LINK_SIZE);
7468      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
7469      if (!is_startline(scode, new_map, backref_map)) return FALSE;
7470      }
7471 
7472    /* Other brackets */
7473 
7474    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_ONCE_NC)
7475      {
7476      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
7477      }
7478 
7479    /* .* means "start at start or after \n" if it isn't in brackets that
7480    may be referenced. */
7481 
7482    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
7483      {
7484      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
7485      }
7486 
7487    /* Check for explicit circumflex */
7488 
7489    else if (op != OP_CIRC && op != OP_CIRCM) return FALSE;
7490 
7491    /* Move on to the next alternative */
7492 
7493    code += GET(code, 1);
7494    }
7495 while (*code == OP_ALT);  /* Loop for each alternative */
7496 return TRUE;
7497 }
7498 
7499 
7500 
7501 /*************************************************
7502 *       Check for asserted fixed first char      *
7503 *************************************************/
7504 
7505 /* During compilation, the "first char" settings from forward assertions are
7506 discarded, because they can cause conflicts with actual literals that follow.
7507 However, if we end up without a first char setting for an unanchored pattern,
7508 it is worth scanning the regex to see if there is an initial asserted first
7509 char. If all branches start with the same asserted char, or with a bracket all
7510 of whose alternatives start with the same asserted char (recurse ad lib), then
7511 we return that char, otherwise -1.
7512 
7513 Arguments:
7514   code       points to start of expression (the bracket)
7515   inassert   TRUE if in an assertion
7516 
7517 Returns:     -1 or the fixed first char
7518 */
7519 
7520 static int
find_firstassertedchar(const pcre_uchar * code,BOOL inassert)7521 find_firstassertedchar(const pcre_uchar *code, BOOL inassert)
7522 {
7523 int c = -1;
7524 do {
7525    int d;
7526    int xl = (*code == OP_CBRA || *code == OP_SCBRA ||
7527              *code == OP_CBRAPOS || *code == OP_SCBRAPOS)? IMM2_SIZE:0;
7528    const pcre_uchar *scode = first_significant_code(code + 1+LINK_SIZE + xl,
7529      TRUE);
7530    int op = *scode;
7531 
7532    switch(op)
7533      {
7534      default:
7535      return -1;
7536 
7537      case OP_BRA:
7538      case OP_BRAPOS:
7539      case OP_CBRA:
7540      case OP_SCBRA:
7541      case OP_CBRAPOS:
7542      case OP_SCBRAPOS:
7543      case OP_ASSERT:
7544      case OP_ONCE:
7545      case OP_ONCE_NC:
7546      case OP_COND:
7547      if ((d = find_firstassertedchar(scode, op == OP_ASSERT)) < 0)
7548        return -1;
7549      if (c < 0) c = d; else if (c != d) return -1;
7550      break;
7551 
7552      case OP_EXACT:
7553      scode += IMM2_SIZE;
7554      /* Fall through */
7555 
7556      case OP_CHAR:
7557      case OP_PLUS:
7558      case OP_MINPLUS:
7559      case OP_POSPLUS:
7560      if (!inassert) return -1;
7561      if (c < 0) c = scode[1];
7562        else if (c != scode[1]) return -1;
7563      break;
7564 
7565      case OP_EXACTI:
7566      scode += IMM2_SIZE;
7567      /* Fall through */
7568 
7569      case OP_CHARI:
7570      case OP_PLUSI:
7571      case OP_MINPLUSI:
7572      case OP_POSPLUSI:
7573      if (!inassert) return -1;
7574      if (c < 0) c = scode[1] | REQ_CASELESS;
7575        else if (c != scode[1]) return -1;
7576      break;
7577      }
7578 
7579    code += GET(code, 1);
7580    }
7581 while (*code == OP_ALT);
7582 return c;
7583 }
7584 
7585 
7586 
7587 /*************************************************
7588 *        Compile a Regular Expression            *
7589 *************************************************/
7590 
7591 /* This function takes a string and returns a pointer to a block of store
7592 holding a compiled version of the expression. The original API for this
7593 function had no error code return variable; it is retained for backwards
7594 compatibility. The new function is given a new name.
7595 
7596 Arguments:
7597   pattern       the regular expression
7598   options       various option bits
7599   errorcodeptr  pointer to error code variable (pcre_compile2() only)
7600                   can be NULL if you don't want a code value
7601   errorptr      pointer to pointer to error text
7602   erroroffset   ptr offset in pattern where error was detected
7603   tables        pointer to character tables or NULL
7604 
7605 Returns:        pointer to compiled data block, or NULL on error,
7606                 with errorptr and erroroffset set
7607 */
7608 
7609 #ifdef COMPILE_PCRE8
7610 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile(const char * pattern,int options,const char ** errorptr,int * erroroffset,const unsigned char * tables)7611 pcre_compile(const char *pattern, int options, const char **errorptr,
7612   int *erroroffset, const unsigned char *tables)
7613 #else
7614 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
7615 pcre16_compile(PCRE_SPTR16 pattern, int options, const char **errorptr,
7616   int *erroroffset, const unsigned char *tables)
7617 #endif
7618 {
7619 #ifdef COMPILE_PCRE8
7620 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
7621 #else
7622 return pcre16_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
7623 #endif
7624 }
7625 
7626 
7627 #ifdef COMPILE_PCRE8
7628 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
pcre_compile2(const char * pattern,int options,int * errorcodeptr,const char ** errorptr,int * erroroffset,const unsigned char * tables)7629 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
7630   const char **errorptr, int *erroroffset, const unsigned char *tables)
7631 #else
7632 PCRE_EXP_DEFN pcre16 * PCRE_CALL_CONVENTION
7633 pcre16_compile2(PCRE_SPTR16 pattern, int options, int *errorcodeptr,
7634   const char **errorptr, int *erroroffset, const unsigned char *tables)
7635 #endif
7636 {
7637 REAL_PCRE *re;
7638 int length = 1;  /* For final END opcode */
7639 pcre_int32 firstchar, reqchar;
7640 int newline;
7641 int errorcode = 0;
7642 int skipatstart = 0;
7643 BOOL utf;
7644 size_t size;
7645 pcre_uchar *code;
7646 const pcre_uchar *codestart;
7647 const pcre_uchar *ptr;
7648 compile_data compile_block;
7649 compile_data *cd = &compile_block;
7650 
7651 /* This space is used for "compiling" into during the first phase, when we are
7652 computing the amount of memory that is needed. Compiled items are thrown away
7653 as soon as possible, so that a fairly large buffer should be sufficient for
7654 this purpose. The same space is used in the second phase for remembering where
7655 to fill in forward references to subpatterns. That may overflow, in which case
7656 new memory is obtained from malloc(). */
7657 
7658 pcre_uchar cworkspace[COMPILE_WORK_SIZE];
7659 
7660 /* Set this early so that early errors get offset 0. */
7661 
7662 ptr = (const pcre_uchar *)pattern;
7663 
7664 /* We can't pass back an error message if errorptr is NULL; I guess the best we
7665 can do is just return NULL, but we can set a code value if there is a code
7666 pointer. */
7667 
7668 if (errorptr == NULL)
7669   {
7670   if (errorcodeptr != NULL) *errorcodeptr = 99;
7671   return NULL;
7672   }
7673 
7674 *errorptr = NULL;
7675 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
7676 
7677 /* However, we can give a message for this error */
7678 
7679 if (erroroffset == NULL)
7680   {
7681   errorcode = ERR16;
7682   goto PCRE_EARLY_ERROR_RETURN2;
7683   }
7684 
7685 *erroroffset = 0;
7686 
7687 /* Set up pointers to the individual character tables */
7688 
7689 if (tables == NULL) tables = PRIV(default_tables);
7690 cd->lcc = tables + lcc_offset;
7691 cd->fcc = tables + fcc_offset;
7692 cd->cbits = tables + cbits_offset;
7693 cd->ctypes = tables + ctypes_offset;
7694 
7695 /* Check that all undefined public option bits are zero */
7696 
7697 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
7698   {
7699   errorcode = ERR17;
7700   goto PCRE_EARLY_ERROR_RETURN;
7701   }
7702 
7703 /* Check for global one-time settings at the start of the pattern, and remember
7704 the offset for later. */
7705 
7706 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
7707        ptr[skipatstart+1] == CHAR_ASTERISK)
7708   {
7709   int newnl = 0;
7710   int newbsr = 0;
7711 
7712 #ifdef COMPILE_PCRE8
7713   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 5) == 0)
7714     { skipatstart += 7; options |= PCRE_UTF8; continue; }
7715 #endif
7716 #ifdef COMPILE_PCRE16
7717   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UTF_RIGHTPAR, 6) == 0)
7718     { skipatstart += 8; options |= PCRE_UTF16; continue; }
7719 #endif
7720   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_UCP_RIGHTPAR, 4) == 0)
7721     { skipatstart += 6; options |= PCRE_UCP; continue; }
7722   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_NO_START_OPT_RIGHTPAR, 13) == 0)
7723     { skipatstart += 15; options |= PCRE_NO_START_OPTIMIZE; continue; }
7724 
7725   if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CR_RIGHTPAR, 3) == 0)
7726     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
7727   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_LF_RIGHTPAR, 3)  == 0)
7728     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
7729   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_CRLF_RIGHTPAR, 5)  == 0)
7730     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
7731   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANY_RIGHTPAR, 4) == 0)
7732     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
7733   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_ANYCRLF_RIGHTPAR, 8) == 0)
7734     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
7735 
7736   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
7737     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
7738   else if (STRNCMP_UC_C8(ptr+skipatstart+2, STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
7739     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
7740 
7741   if (newnl != 0)
7742     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
7743   else if (newbsr != 0)
7744     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
7745   else break;
7746   }
7747 
7748 /* PCRE_UTF16 has the same value as PCRE_UTF8. */
7749 utf = (options & PCRE_UTF8) != 0;
7750 
7751 /* Can't support UTF unless PCRE has been compiled to include the code. The
7752 return of an error code from PRIV(valid_utf)() is a new feature, introduced in
7753 release 8.13. It is passed back from pcre_[dfa_]exec(), but at the moment is
7754 not used here. */
7755 
7756 #ifdef SUPPORT_UTF
7757 if (utf && (options & PCRE_NO_UTF8_CHECK) == 0 &&
7758      (errorcode = PRIV(valid_utf)((PCRE_PUCHAR)pattern, -1, erroroffset)) != 0)
7759   {
7760 #ifdef COMPILE_PCRE8
7761   errorcode = ERR44;
7762 #else
7763   errorcode = ERR74;
7764 #endif
7765   goto PCRE_EARLY_ERROR_RETURN2;
7766   }
7767 #else
7768 if (utf)
7769   {
7770   errorcode = ERR32;
7771   goto PCRE_EARLY_ERROR_RETURN;
7772   }
7773 #endif
7774 
7775 /* Can't support UCP unless PCRE has been compiled to include the code. */
7776 
7777 #ifndef SUPPORT_UCP
7778 if ((options & PCRE_UCP) != 0)
7779   {
7780   errorcode = ERR67;
7781   goto PCRE_EARLY_ERROR_RETURN;
7782   }
7783 #endif
7784 
7785 /* Check validity of \R options. */
7786 
7787 if ((options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) ==
7788      (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
7789   {
7790   errorcode = ERR56;
7791   goto PCRE_EARLY_ERROR_RETURN;
7792   }
7793 
7794 /* Handle different types of newline. The three bits give seven cases. The
7795 current code allows for fixed one- or two-byte sequences, plus "any" and
7796 "anycrlf". */
7797 
7798 switch (options & PCRE_NEWLINE_BITS)
7799   {
7800   case 0: newline = NEWLINE; break;   /* Build-time default */
7801   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
7802   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
7803   case PCRE_NEWLINE_CR+
7804        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
7805   case PCRE_NEWLINE_ANY: newline = -1; break;
7806   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
7807   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
7808   }
7809 
7810 if (newline == -2)
7811   {
7812   cd->nltype = NLTYPE_ANYCRLF;
7813   }
7814 else if (newline < 0)
7815   {
7816   cd->nltype = NLTYPE_ANY;
7817   }
7818 else
7819   {
7820   cd->nltype = NLTYPE_FIXED;
7821   if (newline > 255)
7822     {
7823     cd->nllen = 2;
7824     cd->nl[0] = (newline >> 8) & 255;
7825     cd->nl[1] = newline & 255;
7826     }
7827   else
7828     {
7829     cd->nllen = 1;
7830     cd->nl[0] = newline;
7831     }
7832   }
7833 
7834 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
7835 references to help in deciding whether (.*) can be treated as anchored or not.
7836 */
7837 
7838 cd->top_backref = 0;
7839 cd->backref_map = 0;
7840 
7841 /* Reflect pattern for debugging output */
7842 
7843 DPRINTF(("------------------------------------------------------------------\n"));
7844 #ifdef PCRE_DEBUG
7845 print_puchar(stdout, (PCRE_PUCHAR)pattern);
7846 #endif
7847 DPRINTF(("\n"));
7848 
7849 /* Pretend to compile the pattern while actually just accumulating the length
7850 of memory required. This behaviour is triggered by passing a non-NULL final
7851 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
7852 to compile parts of the pattern into; the compiled code is discarded when it is
7853 no longer needed, so hopefully this workspace will never overflow, though there
7854 is a test for its doing so. */
7855 
7856 cd->bracount = cd->final_bracount = 0;
7857 cd->names_found = 0;
7858 cd->name_entry_size = 0;
7859 cd->name_table = NULL;
7860 cd->start_code = cworkspace;
7861 cd->hwm = cworkspace;
7862 cd->start_workspace = cworkspace;
7863 cd->workspace_size = COMPILE_WORK_SIZE;
7864 cd->start_pattern = (const pcre_uchar *)pattern;
7865 cd->end_pattern = (const pcre_uchar *)(pattern + STRLEN_UC((const pcre_uchar *)pattern));
7866 cd->req_varyopt = 0;
7867 cd->assert_depth = 0;
7868 cd->max_lookbehind = 0;
7869 cd->external_options = options;
7870 cd->external_flags = 0;
7871 cd->open_caps = NULL;
7872 
7873 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
7874 don't need to look at the result of the function here. The initial options have
7875 been put into the cd block so that they can be changed if an option setting is
7876 found within the regex right at the beginning. Bringing initial option settings
7877 outside can help speed up starting point checks. */
7878 
7879 ptr += skipatstart;
7880 code = cworkspace;
7881 *code = OP_BRA;
7882 (void)compile_regex(cd->external_options, &code, &ptr, &errorcode, FALSE,
7883   FALSE, 0, 0, &firstchar, &reqchar, NULL, cd, &length);
7884 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
7885 
7886 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
7887   (int)(cd->hwm - cworkspace)));
7888 
7889 if (length > MAX_PATTERN_SIZE)
7890   {
7891   errorcode = ERR20;
7892   goto PCRE_EARLY_ERROR_RETURN;
7893   }
7894 
7895 /* Compute the size of data block needed and get it, either from malloc or
7896 externally provided function. Integer overflow should no longer be possible
7897 because nowadays we limit the maximum value of cd->names_found and
7898 cd->name_entry_size. */
7899 
7900 size = sizeof(REAL_PCRE) + (length + cd->names_found * cd->name_entry_size) * sizeof(pcre_uchar);
7901 re = (REAL_PCRE *)(PUBL(malloc))(size);
7902 
7903 if (re == NULL)
7904   {
7905   errorcode = ERR21;
7906   goto PCRE_EARLY_ERROR_RETURN;
7907   }
7908 
7909 /* Put in the magic number, and save the sizes, initial options, internal
7910 flags, and character table pointer. NULL is used for the default character
7911 tables. The nullpad field is at the end; it's there to help in the case when a
7912 regex compiled on a system with 4-byte pointers is run on another with 8-byte
7913 pointers. */
7914 
7915 re->magic_number = MAGIC_NUMBER;
7916 re->size = (int)size;
7917 re->options = cd->external_options;
7918 re->flags = cd->external_flags;
7919 re->first_char = 0;
7920 re->req_char = 0;
7921 re->name_table_offset = sizeof(REAL_PCRE) / sizeof(pcre_uchar);
7922 re->name_entry_size = cd->name_entry_size;
7923 re->name_count = cd->names_found;
7924 re->ref_count = 0;
7925 re->tables = (tables == PRIV(default_tables))? NULL : tables;
7926 re->nullpad = NULL;
7927 
7928 /* The starting points of the name/number translation table and of the code are
7929 passed around in the compile data block. The start/end pattern and initial
7930 options are already set from the pre-compile phase, as is the name_entry_size
7931 field. Reset the bracket count and the names_found field. Also reset the hwm
7932 field; this time it's used for remembering forward references to subpatterns.
7933 */
7934 
7935 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
7936 cd->assert_depth = 0;
7937 cd->bracount = 0;
7938 cd->max_lookbehind = 0;
7939 cd->names_found = 0;
7940 cd->name_table = (pcre_uchar *)re + re->name_table_offset;
7941 codestart = cd->name_table + re->name_entry_size * re->name_count;
7942 cd->start_code = codestart;
7943 cd->hwm = (pcre_uchar *)(cd->start_workspace);
7944 cd->req_varyopt = 0;
7945 cd->had_accept = FALSE;
7946 cd->check_lookbehind = FALSE;
7947 cd->open_caps = NULL;
7948 
7949 /* Set up a starting, non-extracting bracket, then compile the expression. On
7950 error, errorcode will be set non-zero, so we don't need to look at the result
7951 of the function here. */
7952 
7953 ptr = (const pcre_uchar *)pattern + skipatstart;
7954 code = (pcre_uchar *)codestart;
7955 *code = OP_BRA;
7956 (void)compile_regex(re->options, &code, &ptr, &errorcode, FALSE, FALSE, 0, 0,
7957   &firstchar, &reqchar, NULL, cd, NULL);
7958 re->top_bracket = cd->bracount;
7959 re->top_backref = cd->top_backref;
7960 re->max_lookbehind = cd->max_lookbehind;
7961 re->flags = cd->external_flags | PCRE_MODE;
7962 
7963 if (cd->had_accept) reqchar = REQ_NONE;   /* Must disable after (*ACCEPT) */
7964 
7965 /* If not reached end of pattern on success, there's an excess bracket. */
7966 
7967 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
7968 
7969 /* Fill in the terminating state and check for disastrous overflow, but
7970 if debugging, leave the test till after things are printed out. */
7971 
7972 *code++ = OP_END;
7973 
7974 #ifndef PCRE_DEBUG
7975 if (code - codestart > length) errorcode = ERR23;
7976 #endif
7977 
7978 /* Fill in any forward references that are required. There may be repeated
7979 references; optimize for them, as searching a large regex takes time. */
7980 
7981 if (cd->hwm > cd->start_workspace)
7982   {
7983   int prev_recno = -1;
7984   const pcre_uchar *groupptr = NULL;
7985   while (errorcode == 0 && cd->hwm > cd->start_workspace)
7986     {
7987     int offset, recno;
7988     cd->hwm -= LINK_SIZE;
7989     offset = GET(cd->hwm, 0);
7990     recno = GET(codestart, offset);
7991     if (recno != prev_recno)
7992       {
7993       groupptr = PRIV(find_bracket)(codestart, utf, recno);
7994       prev_recno = recno;
7995       }
7996     if (groupptr == NULL) errorcode = ERR53;
7997       else PUT(((pcre_uchar *)codestart), offset, (int)(groupptr - codestart));
7998     }
7999   }
8000 
8001 /* If the workspace had to be expanded, free the new memory. */
8002 
8003 if (cd->workspace_size > COMPILE_WORK_SIZE)
8004   (PUBL(free))((void *)cd->start_workspace);
8005 
8006 /* Give an error if there's back reference to a non-existent capturing
8007 subpattern. */
8008 
8009 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
8010 
8011 /* If there were any lookbehind assertions that contained OP_RECURSE
8012 (recursions or subroutine calls), a flag is set for them to be checked here,
8013 because they may contain forward references. Actual recursions can't be fixed
8014 length, but subroutine calls can. It is done like this so that those without
8015 OP_RECURSE that are not fixed length get a diagnosic with a useful offset. The
8016 exceptional ones forgo this. We scan the pattern to check that they are fixed
8017 length, and set their lengths. */
8018 
8019 if (cd->check_lookbehind)
8020   {
8021   pcre_uchar *cc = (pcre_uchar *)codestart;
8022 
8023   /* Loop, searching for OP_REVERSE items, and process those that do not have
8024   their length set. (Actually, it will also re-process any that have a length
8025   of zero, but that is a pathological case, and it does no harm.) When we find
8026   one, we temporarily terminate the branch it is in while we scan it. */
8027 
8028   for (cc = (pcre_uchar *)PRIV(find_bracket)(codestart, utf, -1);
8029        cc != NULL;
8030        cc = (pcre_uchar *)PRIV(find_bracket)(cc, utf, -1))
8031     {
8032     if (GET(cc, 1) == 0)
8033       {
8034       int fixed_length;
8035       pcre_uchar *be = cc - 1 - LINK_SIZE + GET(cc, -LINK_SIZE);
8036       int end_op = *be;
8037       *be = OP_END;
8038       fixed_length = find_fixedlength(cc, (re->options & PCRE_UTF8) != 0, TRUE,
8039         cd);
8040       *be = end_op;
8041       DPRINTF(("fixed length = %d\n", fixed_length));
8042       if (fixed_length < 0)
8043         {
8044         errorcode = (fixed_length == -2)? ERR36 :
8045                     (fixed_length == -4)? ERR70 : ERR25;
8046         break;
8047         }
8048       if (fixed_length > cd->max_lookbehind) cd->max_lookbehind = fixed_length;
8049       PUT(cc, 1, fixed_length);
8050       }
8051     cc += 1 + LINK_SIZE;
8052     }
8053   }
8054 
8055 /* Failed to compile, or error while post-processing */
8056 
8057 if (errorcode != 0)
8058   {
8059   (PUBL(free))(re);
8060   PCRE_EARLY_ERROR_RETURN:
8061   *erroroffset = (int)(ptr - (const pcre_uchar *)pattern);
8062   PCRE_EARLY_ERROR_RETURN2:
8063   *errorptr = find_error_text(errorcode);
8064   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
8065   return NULL;
8066   }
8067 
8068 /* If the anchored option was not passed, set the flag if we can determine that
8069 the pattern is anchored by virtue of ^ characters or \A or anything else (such
8070 as starting with .* when DOTALL is set).
8071 
8072 Otherwise, if we know what the first byte has to be, save it, because that
8073 speeds up unanchored matches no end. If not, see if we can set the
8074 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
8075 start with ^. and also when all branches start with .* for non-DOTALL matches.
8076 */
8077 
8078 if ((re->options & PCRE_ANCHORED) == 0)
8079   {
8080   if (is_anchored(codestart, 0, cd->backref_map))
8081     re->options |= PCRE_ANCHORED;
8082   else
8083     {
8084     if (firstchar < 0)
8085       firstchar = find_firstassertedchar(codestart, FALSE);
8086     if (firstchar >= 0)   /* Remove caseless flag for non-caseable chars */
8087       {
8088 #ifdef COMPILE_PCRE8
8089       re->first_char = firstchar & 0xff;
8090 #else
8091 #ifdef COMPILE_PCRE16
8092       re->first_char = firstchar & 0xffff;
8093 #endif
8094 #endif
8095       if ((firstchar & REQ_CASELESS) != 0)
8096         {
8097 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
8098         /* We ignore non-ASCII first chars in 8 bit mode. */
8099         if (utf)
8100           {
8101           if (re->first_char < 128)
8102             {
8103             if (cd->fcc[re->first_char] != re->first_char)
8104               re->flags |= PCRE_FCH_CASELESS;
8105             }
8106           else if (UCD_OTHERCASE(re->first_char) != re->first_char)
8107             re->flags |= PCRE_FCH_CASELESS;
8108           }
8109         else
8110 #endif
8111         if (MAX_255(re->first_char)
8112             && cd->fcc[re->first_char] != re->first_char)
8113           re->flags |= PCRE_FCH_CASELESS;
8114         }
8115 
8116       re->flags |= PCRE_FIRSTSET;
8117       }
8118     else if (is_startline(codestart, 0, cd->backref_map))
8119       re->flags |= PCRE_STARTLINE;
8120     }
8121   }
8122 
8123 /* For an anchored pattern, we use the "required byte" only if it follows a
8124 variable length item in the regex. Remove the caseless flag for non-caseable
8125 bytes. */
8126 
8127 if (reqchar >= 0 &&
8128      ((re->options & PCRE_ANCHORED) == 0 || (reqchar & REQ_VARY) != 0))
8129   {
8130 #ifdef COMPILE_PCRE8
8131   re->req_char = reqchar & 0xff;
8132 #else
8133 #ifdef COMPILE_PCRE16
8134   re->req_char = reqchar & 0xffff;
8135 #endif
8136 #endif
8137   if ((reqchar & REQ_CASELESS) != 0)
8138     {
8139 #if defined SUPPORT_UCP && !(defined COMPILE_PCRE8)
8140     /* We ignore non-ASCII first chars in 8 bit mode. */
8141     if (utf)
8142       {
8143       if (re->req_char < 128)
8144         {
8145         if (cd->fcc[re->req_char] != re->req_char)
8146           re->flags |= PCRE_RCH_CASELESS;
8147         }
8148       else if (UCD_OTHERCASE(re->req_char) != re->req_char)
8149         re->flags |= PCRE_RCH_CASELESS;
8150       }
8151     else
8152 #endif
8153     if (MAX_255(re->req_char) && cd->fcc[re->req_char] != re->req_char)
8154       re->flags |= PCRE_RCH_CASELESS;
8155     }
8156 
8157   re->flags |= PCRE_REQCHSET;
8158   }
8159 
8160 /* Print out the compiled data if debugging is enabled. This is never the
8161 case when building a production library. */
8162 
8163 #ifdef PCRE_DEBUG
8164 printf("Length = %d top_bracket = %d top_backref = %d\n",
8165   length, re->top_bracket, re->top_backref);
8166 
8167 printf("Options=%08x\n", re->options);
8168 
8169 if ((re->flags & PCRE_FIRSTSET) != 0)
8170   {
8171   pcre_uchar ch = re->first_char;
8172   const char *caseless =
8173     ((re->flags & PCRE_FCH_CASELESS) == 0)? "" : " (caseless)";
8174   if (PRINTABLE(ch)) printf("First char = %c%s\n", ch, caseless);
8175     else printf("First char = \\x%02x%s\n", ch, caseless);
8176   }
8177 
8178 if ((re->flags & PCRE_REQCHSET) != 0)
8179   {
8180   pcre_uchar ch = re->req_char;
8181   const char *caseless =
8182     ((re->flags & PCRE_RCH_CASELESS) == 0)? "" : " (caseless)";
8183   if (PRINTABLE(ch)) printf("Req char = %c%s\n", ch, caseless);
8184     else printf("Req char = \\x%02x%s\n", ch, caseless);
8185   }
8186 
8187 #ifdef COMPILE_PCRE8
8188 pcre_printint((pcre *)re, stdout, TRUE);
8189 #else
8190 pcre16_printint((pcre *)re, stdout, TRUE);
8191 #endif
8192 
8193 /* This check is done here in the debugging case so that the code that
8194 was compiled can be seen. */
8195 
8196 if (code - codestart > length)
8197   {
8198   (PUBL(free))(re);
8199   *errorptr = find_error_text(ERR23);
8200   *erroroffset = ptr - (pcre_uchar *)pattern;
8201   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
8202   return NULL;
8203   }
8204 #endif   /* PCRE_DEBUG */
8205 
8206 #ifdef COMPILE_PCRE8
8207 return (pcre *)re;
8208 #else
8209 return (pcre16 *)re;
8210 #endif
8211 }
8212 
8213 /* End of pcre_compile.c */
8214