• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* This is JavaScriptCore's variant of the PCRE library. While this library
2 started out as a copy of PCRE, many of the features of PCRE have been
3 removed. This library now supports only the regular expression features
4 required by the JavaScript language specification, and has only the functions
5 needed by JavaScriptCore and the rest of WebKit.
6 
7                  Originally written by Philip Hazel
8            Copyright (c) 1997-2006 University of Cambridge
9     Copyright (C) 2002, 2004, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved.
10     Copyright (C) 2007 Eric Seidel <eric@webkit.org>
11 
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15 
16     * Redistributions of source code must retain the above copyright notice,
17       this list of conditions and the following disclaimer.
18 
19     * Redistributions in binary form must reproduce the above copyright
20       notice, this list of conditions and the following disclaimer in the
21       documentation and/or other materials provided with the distribution.
22 
23     * Neither the name of the University of Cambridge nor the names of its
24       contributors may be used to endorse or promote products derived from
25       this software without specific prior written permission.
26 
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40 
41 /* This module contains the external function jsRegExpExecute(), along with
42 supporting internal functions that are not used by other modules. */
43 
44 #include "config.h"
45 
46 #include "pcre_internal.h"
47 
48 #include <string.h>
49 #include <wtf/ASCIICType.h>
50 #include <wtf/FastMalloc.h>
51 
52 using namespace WTF;
53 
54 /* Negative values for the firstchar and reqchar variables */
55 
56 #define REQ_UNSET (-2)
57 #define REQ_NONE  (-1)
58 
59 /*************************************************
60 *      Code parameters and static tables         *
61 *************************************************/
62 
63 /* Maximum number of items on the nested bracket stacks at compile time. This
64 applies to the nesting of all kinds of parentheses. It does not limit
65 un-nested, non-capturing parentheses. This number can be made bigger if
66 necessary - it is used to dimension one int and one unsigned char vector at
67 compile time. */
68 
69 #define BRASTACK_SIZE 200
70 
71 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
72 are simple data values; negative values are for special things like \d and so
73 on. Zero means further processing is needed (for things like \x), or the escape
74 is invalid. */
75 
76 static const short escapes[] = {
77      0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
78      0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
79    '@',      0, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
80      0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
81      0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
82      0,      0,      0,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
83    '`',      7, -ESC_b,      0, -ESC_d,      0,   '\f',      0,   /* ` - g */
84      0,      0,      0,      0,      0,      0,   '\n',      0,   /* h - o */
85      0,      0,    '\r', -ESC_s,   '\t',      0,  '\v', -ESC_w,   /* p - w */
86      0,      0,      0                                            /* x - z */
87 };
88 
89 /* Error code numbers. They are given names so that they can more easily be
90 tracked. */
91 
92 enum ErrorCode {
93     ERR0, ERR1, ERR2, ERR3, ERR4, ERR5, ERR6, ERR7, ERR8, ERR9,
94     ERR10, ERR11, ERR12, ERR13, ERR14, ERR15, ERR16, ERR17
95 };
96 
97 /* The texts of compile-time error messages. These are "char *" because they
98 are passed to the outside world. */
99 
errorText(ErrorCode code)100 static const char* errorText(ErrorCode code)
101 {
102     static const char errorTexts[] =
103       /* 1 */
104       "\\ at end of pattern\0"
105       "\\c at end of pattern\0"
106       "character value in \\x{...} sequence is too large\0"
107       "numbers out of order in {} quantifier\0"
108       /* 5 */
109       "number too big in {} quantifier\0"
110       "missing terminating ] for character class\0"
111       "internal error: code overflow\0"
112       "range out of order in character class\0"
113       "nothing to repeat\0"
114       /* 10 */
115       "unmatched parentheses\0"
116       "internal error: unexpected repeat\0"
117       "unrecognized character after (?\0"
118       "failed to get memory\0"
119       "missing )\0"
120       /* 15 */
121       "reference to non-existent subpattern\0"
122       "regular expression too large\0"
123       "parentheses nested too deeply"
124     ;
125 
126     int i = code;
127     const char* text = errorTexts;
128     while (i > 1)
129         i -= !*text++;
130     return text;
131 }
132 
133 /* Structure for passing "static" information around between the functions
134 doing the compiling. */
135 
136 struct CompileData {
CompileDataCompileData137     CompileData() {
138         topBackref = 0;
139         backrefMap = 0;
140         reqVaryOpt = 0;
141         needOuterBracket = false;
142         numCapturingBrackets = 0;
143     }
144     int topBackref;            /* Maximum back reference */
145     unsigned backrefMap;       /* Bitmap of low back refs */
146     int reqVaryOpt;            /* "After variable item" flag for reqByte */
147     bool needOuterBracket;
148     int numCapturingBrackets;
149 };
150 
151 /* Definitions to allow mutual recursion */
152 
153 static bool compileBracket(int, int*, unsigned char**, const UChar**, const UChar*, ErrorCode*, int, int*, int*, CompileData&);
154 static bool bracketIsAnchored(const unsigned char* code);
155 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap);
156 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert);
157 
158 /*************************************************
159 *            Handle escapes                      *
160 *************************************************/
161 
162 /* This function is called when a \ has been encountered. It either returns a
163 positive value for a simple escape such as \n, or a negative value which
164 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
165 a positive value greater than 255 may be returned. On entry, ptr is pointing at
166 the \. On exit, it is on the final character of the escape sequence.
167 
168 Arguments:
169   ptrPtr         points to the pattern position pointer
170   errorCodePtr   points to the errorcode variable
171   bracount       number of previous extracting brackets
172   options        the options bits
173   isClass        true if inside a character class
174 
175 Returns:         zero or positive => a data character
176                  negative => a special escape sequence
177                  on error, errorPtr is set
178 */
179 
checkEscape(const UChar ** ptrPtr,const UChar * patternEnd,ErrorCode * errorCodePtr,int bracount,bool isClass)180 static int checkEscape(const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int bracount, bool isClass)
181 {
182     const UChar* ptr = *ptrPtr + 1;
183 
184     /* If backslash is at the end of the pattern, it's an error. */
185     if (ptr == patternEnd) {
186         *errorCodePtr = ERR1;
187         *ptrPtr = ptr;
188         return 0;
189     }
190 
191     int c = *ptr;
192 
193     /* Non-alphamerics are literals. For digits or letters, do an initial lookup in
194      a table. A non-zero result is something that can be returned immediately.
195      Otherwise further processing may be required. */
196 
197     if (c < '0' || c > 'z') { /* Not alphameric */
198     } else if (int escapeValue = escapes[c - '0']) {
199         c = escapeValue;
200         if (isClass) {
201             if (-c == ESC_b)
202                 c = '\b'; /* \b is backslash in a class */
203             else if (-c == ESC_B)
204                 c = 'B'; /* and \B is a capital B in a class (in browsers event though ECMAScript 15.10.2.19 says it raises an error) */
205         }
206     /* Escapes that need further processing, or are illegal. */
207 
208     } else {
209         switch (c) {
210             case '1':
211             case '2':
212             case '3':
213             case '4':
214             case '5':
215             case '6':
216             case '7':
217             case '8':
218             case '9':
219                 /* Escape sequences starting with a non-zero digit are backreferences,
220                  unless there are insufficient brackets, in which case they are octal
221                  escape sequences. Those sequences end on the first non-octal character
222                  or when we overflow 0-255, whichever comes first. */
223 
224                 if (!isClass) {
225                     const UChar* oldptr = ptr;
226                     c -= '0';
227                     while ((ptr + 1 < patternEnd) && isASCIIDigit(ptr[1]) && c <= bracount)
228                         c = c * 10 + *(++ptr) - '0';
229                     if (c <= bracount) {
230                         c = -(ESC_REF + c);
231                         break;
232                     }
233                     ptr = oldptr;      /* Put the pointer back and fall through */
234                 }
235 
236                 /* Handle an octal number following \. If the first digit is 8 or 9,
237                  this is not octal. */
238 
239                 if ((c = *ptr) >= '8') {
240                     c = '\\';
241                     ptr -= 1;
242                     break;
243                 }
244 
245             /* \0 always starts an octal number, but we may drop through to here with a
246              larger first octal digit. */
247 
248             case '0': {
249                 c -= '0';
250                 int i;
251                 for (i = 1; i <= 2; ++i) {
252                     if (ptr + i >= patternEnd || ptr[i] < '0' || ptr[i] > '7')
253                         break;
254                     int cc = c * 8 + ptr[i] - '0';
255                     if (cc > 255)
256                         break;
257                     c = cc;
258                 }
259                 ptr += i - 1;
260                 break;
261             }
262 
263             case 'x': {
264                 c = 0;
265                 int i;
266                 for (i = 1; i <= 2; ++i) {
267                     if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
268                         c = 'x';
269                         i = 1;
270                         break;
271                     }
272                     int cc = ptr[i];
273                     if (cc >= 'a')
274                         cc -= 32;             /* Convert to upper case */
275                     c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
276                 }
277                 ptr += i - 1;
278                 break;
279             }
280 
281             case 'u': {
282                 c = 0;
283                 int i;
284                 for (i = 1; i <= 4; ++i) {
285                     if (ptr + i >= patternEnd || !isASCIIHexDigit(ptr[i])) {
286                         c = 'u';
287                         i = 1;
288                         break;
289                     }
290                     int cc = ptr[i];
291                     if (cc >= 'a')
292                         cc -= 32;             /* Convert to upper case */
293                     c = c * 16 + cc - ((cc < 'A') ? '0' : ('A' - 10));
294                 }
295                 ptr += i - 1;
296                 break;
297             }
298 
299             case 'c':
300                 if (++ptr == patternEnd) {
301                     *errorCodePtr = ERR2;
302                     return 0;
303                 }
304 
305                 c = *ptr;
306 
307                 /* To match Firefox, inside a character class, we also accept
308                    numbers and '_' as control characters */
309                 if ((!isClass && !isASCIIAlpha(c)) || (!isASCIIAlphanumeric(c) && c != '_')) {
310                     c = '\\';
311                     ptr -= 2;
312                     break;
313                 }
314 
315                 /* A letter is upper-cased; then the 0x40 bit is flipped. This coding
316                  is ASCII-specific, but then the whole concept of \cx is ASCII-specific. */
317                 c = toASCIIUpper(c) ^ 0x40;
318                 break;
319             }
320     }
321 
322     *ptrPtr = ptr;
323     return c;
324 }
325 
326 /*************************************************
327 *            Check for counted repeat            *
328 *************************************************/
329 
330 /* This function is called when a '{' is encountered in a place where it might
331 start a quantifier. It looks ahead to see if it really is a quantifier or not.
332 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
333 where the ddds are digits.
334 
335 Arguments:
336   p         pointer to the first char after '{'
337 
338 Returns:    true or false
339 */
340 
isCountedRepeat(const UChar * p,const UChar * patternEnd)341 static bool isCountedRepeat(const UChar* p, const UChar* patternEnd)
342 {
343     if (p >= patternEnd || !isASCIIDigit(*p))
344         return false;
345     p++;
346     while (p < patternEnd && isASCIIDigit(*p))
347         p++;
348     if (p < patternEnd && *p == '}')
349         return true;
350 
351     if (p >= patternEnd || *p++ != ',')
352         return false;
353     if (p < patternEnd && *p == '}')
354         return true;
355 
356     if (p >= patternEnd || !isASCIIDigit(*p))
357         return false;
358     p++;
359     while (p < patternEnd && isASCIIDigit(*p))
360         p++;
361 
362     return (p < patternEnd && *p == '}');
363 }
364 
365 /*************************************************
366 *         Read repeat counts                     *
367 *************************************************/
368 
369 /* Read an item of the form {n,m} and return the values. This is called only
370 after isCountedRepeat() has confirmed that a repeat-count quantifier exists,
371 so the syntax is guaranteed to be correct, but we need to check the values.
372 
373 Arguments:
374   p              pointer to first char after '{'
375   minp           pointer to int for min
376   maxp           pointer to int for max
377                  returned as -1 if no max
378   errorCodePtr   points to error code variable
379 
380 Returns:         pointer to '}' on success;
381                  current ptr on error, with errorCodePtr set non-zero
382 */
383 
readRepeatCounts(const UChar * p,int * minp,int * maxp,ErrorCode * errorCodePtr)384 static const UChar* readRepeatCounts(const UChar* p, int* minp, int* maxp, ErrorCode* errorCodePtr)
385 {
386     int min = 0;
387     int max = -1;
388 
389     /* Read the minimum value and do a paranoid check: a negative value indicates
390      an integer overflow. */
391 
392     while (isASCIIDigit(*p))
393         min = min * 10 + *p++ - '0';
394     if (min < 0 || min > 65535) {
395         *errorCodePtr = ERR5;
396         return p;
397     }
398 
399     /* Read the maximum value if there is one, and again do a paranoid on its size.
400      Also, max must not be less than min. */
401 
402     if (*p == '}')
403         max = min;
404     else {
405         if (*(++p) != '}') {
406             max = 0;
407             while (isASCIIDigit(*p))
408                 max = max * 10 + *p++ - '0';
409             if (max < 0 || max > 65535) {
410                 *errorCodePtr = ERR5;
411                 return p;
412             }
413             if (max < min) {
414                 *errorCodePtr = ERR4;
415                 return p;
416             }
417         }
418     }
419 
420     /* Fill in the required variables, and pass back the pointer to the terminating
421      '}'. */
422 
423     *minp = min;
424     *maxp = max;
425     return p;
426 }
427 
428 /*************************************************
429 *      Find first significant op code            *
430 *************************************************/
431 
432 /* This is called by several functions that scan a compiled expression looking
433 for a fixed first character, or an anchoring op code etc. It skips over things
434 that do not influence this.
435 
436 Arguments:
437   code         pointer to the start of the group
438 Returns:       pointer to the first significant opcode
439 */
440 
firstSignificantOpcode(const unsigned char * code)441 static const unsigned char* firstSignificantOpcode(const unsigned char* code)
442 {
443     while (*code == OP_BRANUMBER)
444         code += 3;
445     return code;
446 }
447 
firstSignificantOpcodeSkippingAssertions(const unsigned char * code)448 static const unsigned char* firstSignificantOpcodeSkippingAssertions(const unsigned char* code)
449 {
450     while (true) {
451         switch (*code) {
452             case OP_ASSERT_NOT:
453                 advanceToEndOfBracket(code);
454                 code += 1 + LINK_SIZE;
455                 break;
456             case OP_WORD_BOUNDARY:
457             case OP_NOT_WORD_BOUNDARY:
458                 ++code;
459                 break;
460             case OP_BRANUMBER:
461                 code += 3;
462                 break;
463             default:
464                 return code;
465         }
466     }
467 }
468 
469 /*************************************************
470 *           Get othercase range                  *
471 *************************************************/
472 
473 /* This function is passed the start and end of a class range, in UTF-8 mode
474 with UCP support. It searches up the characters, looking for internal ranges of
475 characters in the "other" case. Each call returns the next one, updating the
476 start address.
477 
478 Arguments:
479   cptr        points to starting character value; updated
480   d           end value
481   ocptr       where to put start of othercase range
482   odptr       where to put end of othercase range
483 
484 Yield:        true when range returned; false when no more
485 */
486 
getOthercaseRange(int * cptr,int d,int * ocptr,int * odptr)487 static bool getOthercaseRange(int* cptr, int d, int* ocptr, int* odptr)
488 {
489     int c, othercase = 0;
490 
491     for (c = *cptr; c <= d; c++) {
492         if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0)
493             break;
494     }
495 
496     if (c > d)
497         return false;
498 
499     *ocptr = othercase;
500     int next = othercase + 1;
501 
502     for (++c; c <= d; c++) {
503         if (jsc_pcre_ucp_othercase(c) != next)
504             break;
505         next++;
506     }
507 
508     *odptr = next - 1;
509     *cptr = c;
510 
511     return true;
512 }
513 
514 /*************************************************
515  *       Convert character value to UTF-8         *
516  *************************************************/
517 
518 /* This function takes an integer value in the range 0 - 0x7fffffff
519  and encodes it as a UTF-8 character in 0 to 6 bytes.
520 
521  Arguments:
522  cvalue     the character value
523  buffer     pointer to buffer for result - at least 6 bytes long
524 
525  Returns:     number of characters placed in the buffer
526  */
527 
encodeUTF8(int cvalue,unsigned char * buffer)528 static int encodeUTF8(int cvalue, unsigned char *buffer)
529 {
530     int i;
531     for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
532         if (cvalue <= jsc_pcre_utf8_table1[i])
533             break;
534     buffer += i;
535     for (int j = i; j > 0; j--) {
536         *buffer-- = 0x80 | (cvalue & 0x3f);
537         cvalue >>= 6;
538     }
539     *buffer = jsc_pcre_utf8_table2[i] | cvalue;
540     return i + 1;
541 }
542 
543 /*************************************************
544 *           Compile one branch                   *
545 *************************************************/
546 
547 /* Scan the pattern, compiling it into the code vector.
548 
549 Arguments:
550   options        the option bits
551   brackets       points to number of extracting brackets used
552   codePtr        points to the pointer to the current code point
553   ptrPtr         points to the current pattern pointer
554   errorCodePtr   points to error code variable
555   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
556   reqbyteptr     set to the last literal character required, else < 0
557   cd             contains pointers to tables etc.
558 
559 Returns:         true on success
560                  false, with *errorCodePtr set non-zero on error
561 */
562 
safelyCheckNextChar(const UChar * ptr,const UChar * patternEnd,UChar expected)563 static inline bool safelyCheckNextChar(const UChar* ptr, const UChar* patternEnd, UChar expected)
564 {
565     return ((ptr + 1 < patternEnd) && ptr[1] == expected);
566 }
567 
568 static bool
compileBranch(int options,int * brackets,unsigned char ** codePtr,const UChar ** ptrPtr,const UChar * patternEnd,ErrorCode * errorCodePtr,int * firstbyteptr,int * reqbyteptr,CompileData & cd)569 compileBranch(int options, int* brackets, unsigned char** codePtr,
570                const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int *firstbyteptr,
571                int* reqbyteptr, CompileData& cd)
572 {
573     int repeatType, opType;
574     int repeatMin = 0, repeat_max = 0;      /* To please picky compilers */
575     int bravalue = 0;
576     int reqvary, tempreqvary;
577     int c;
578     unsigned char* code = *codePtr;
579     unsigned char* tempcode;
580     bool didGroupSetFirstByte = false;
581     const UChar* ptr = *ptrPtr;
582     const UChar* tempptr;
583     unsigned char* previous = NULL;
584     unsigned char classbits[32];
585 
586     bool class_utf8;
587     unsigned char* class_utf8data;
588     unsigned char utf8_char[6];
589 
590     /* Initialize no first byte, no required byte. REQ_UNSET means "no char
591      matching encountered yet". It gets changed to REQ_NONE if we hit something that
592      matches a non-fixed char first char; reqByte just remains unset if we never
593      find one.
594 
595      When we hit a repeat whose minimum is zero, we may have to adjust these values
596      to take the zero repeat into account. This is implemented by setting them to
597      zeroFirstByte and zeroReqByte when such a repeat is encountered. The individual
598      item types that can be repeated set these backoff variables appropriately. */
599 
600     int firstByte = REQ_UNSET;
601     int reqByte = REQ_UNSET;
602     int zeroReqByte = REQ_UNSET;
603     int zeroFirstByte = REQ_UNSET;
604 
605     /* The variable reqCaseOpt contains either the REQ_IGNORE_CASE value or zero,
606      according to the current setting of the ignores-case flag. REQ_IGNORE_CASE is a bit
607      value > 255. It is added into the firstByte or reqByte variables to record the
608      case status of the value. This is used only for ASCII characters. */
609 
610     int reqCaseOpt = (options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0;
611 
612     /* Switch on next character until the end of the branch */
613 
614     for (;; ptr++) {
615         bool negateClass;
616         bool shouldFlipNegation; /* If a negative special such as \S is used, we should negate the whole class to properly support Unicode. */
617         int classCharCount;
618         int classLastChar;
619         int skipBytes;
620         int subReqByte;
621         int subFirstByte;
622         int mcLength;
623         unsigned char mcbuffer[8];
624 
625         /* Next byte in the pattern */
626 
627         c = ptr < patternEnd ? *ptr : 0;
628 
629         /* Fill in length of a previous callout, except when the next thing is
630          a quantifier. */
631 
632         bool isQuantifier = c == '*' || c == '+' || c == '?' || (c == '{' && isCountedRepeat(ptr + 1, patternEnd));
633 
634         switch (c) {
635             /* The branch terminates at end of string, |, or ). */
636 
637             case 0:
638                 if (ptr < patternEnd)
639                     goto NORMAL_CHAR;
640                 // End of string; fall through
641             case '|':
642             case ')':
643                 *firstbyteptr = firstByte;
644                 *reqbyteptr = reqByte;
645                 *codePtr = code;
646                 *ptrPtr = ptr;
647                 return true;
648 
649             /* Handle single-character metacharacters. In multiline mode, ^ disables
650              the setting of any following char as a first character. */
651 
652             case '^':
653                 if (options & MatchAcrossMultipleLinesOption) {
654                     if (firstByte == REQ_UNSET)
655                         firstByte = REQ_NONE;
656                     *code++ = OP_BOL;
657                 } else
658                     *code++ = OP_CIRC;
659                 previous = NULL;
660                 break;
661 
662             case '$':
663                 previous = NULL;
664                 if (options & MatchAcrossMultipleLinesOption)
665                   *code++ = OP_EOL;
666                 else
667                   *code++ = OP_DOLL;
668                 break;
669 
670             /* There can never be a first char if '.' is first, whatever happens about
671              repeats. The value of reqByte doesn't change either. */
672 
673             case '.':
674                 if (firstByte == REQ_UNSET)
675                     firstByte = REQ_NONE;
676                 zeroFirstByte = firstByte;
677                 zeroReqByte = reqByte;
678                 previous = code;
679                 *code++ = OP_NOT_NEWLINE;
680                 break;
681 
682             /* Character classes. If the included characters are all < 256, we build a
683              32-byte bitmap of the permitted characters, except in the special case
684              where there is only one such character. For negated classes, we build the
685              map as usual, then invert it at the end. However, we use a different opcode
686              so that data characters > 255 can be handled correctly.
687 
688              If the class contains characters outside the 0-255 range, a different
689              opcode is compiled. It may optionally have a bit map for characters < 256,
690              but those above are are explicitly listed afterwards. A flag byte tells
691              whether the bitmap is present, and whether this is a negated class or not.
692              */
693 
694             case '[': {
695                 previous = code;
696                 shouldFlipNegation = false;
697 
698                 /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
699                  they are encountered at the top level, so we'll do that too. */
700 
701                 /* If the first character is '^', set the negation flag and skip it. */
702 
703                 if (ptr + 1 >= patternEnd) {
704                     *errorCodePtr = ERR6;
705                     return false;
706                 }
707 
708                 if (ptr[1] == '^') {
709                     negateClass = true;
710                     ++ptr;
711                 } else
712                     negateClass = false;
713 
714                 /* Keep a count of chars with values < 256 so that we can optimize the case
715                  of just a single character (as long as it's < 256). For higher valued UTF-8
716                  characters, we don't yet do any optimization. */
717 
718                 classCharCount = 0;
719                 classLastChar = -1;
720 
721                 class_utf8 = false;                       /* No chars >= 256 */
722                 class_utf8data = code + LINK_SIZE + 34;   /* For UTF-8 items */
723 
724                 /* Initialize the 32-char bit map to all zeros. We have to build the
725                  map in a temporary bit of store, in case the class contains only 1
726                  character (< 256), because in that case the compiled code doesn't use the
727                  bit map. */
728 
729                 memset(classbits, 0, 32 * sizeof(unsigned char));
730 
731                 /* Process characters until ] is reached. The first pass
732                  through the regex checked the overall syntax, so we don't need to be very
733                  strict here. At the start of the loop, c contains the first byte of the
734                  character. */
735 
736                 while ((++ptr < patternEnd) && (c = *ptr) != ']') {
737                     /* Backslash may introduce a single character, or it may introduce one
738                      of the specials, which just set a flag. Escaped items are checked for
739                      validity in the pre-compiling pass. The sequence \b is a special case.
740                      Inside a class (and only there) it is treated as backspace. Elsewhere
741                      it marks a word boundary. Other escapes have preset maps ready to
742                      or into the one we are building. We assume they have more than one
743                      character in them, so set classCharCount bigger than one. */
744 
745                     if (c == '\\') {
746                         c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);
747                         if (c < 0) {
748                             classCharCount += 2;     /* Greater than 1 is what matters */
749                             switch (-c) {
750                                 case ESC_d:
751                                     for (c = 0; c < 32; c++)
752                                         classbits[c] |= classBitmapForChar(c + cbit_digit);
753                                     continue;
754 
755                                 case ESC_D:
756                                     shouldFlipNegation = true;
757                                     for (c = 0; c < 32; c++)
758                                         classbits[c] |= ~classBitmapForChar(c + cbit_digit);
759                                     continue;
760 
761                                 case ESC_w:
762                                     for (c = 0; c < 32; c++)
763                                         classbits[c] |= classBitmapForChar(c + cbit_word);
764                                     continue;
765 
766                                 case ESC_W:
767                                     shouldFlipNegation = true;
768                                     for (c = 0; c < 32; c++)
769                                         classbits[c] |= ~classBitmapForChar(c + cbit_word);
770                                     continue;
771 
772                                 case ESC_s:
773                                     for (c = 0; c < 32; c++)
774                                          classbits[c] |= classBitmapForChar(c + cbit_space);
775                                     continue;
776 
777                                 case ESC_S:
778                                     shouldFlipNegation = true;
779                                     for (c = 0; c < 32; c++)
780                                          classbits[c] |= ~classBitmapForChar(c + cbit_space);
781                                     continue;
782 
783                                     /* Unrecognized escapes are faulted if PCRE is running in its
784                                      strict mode. By default, for compatibility with Perl, they are
785                                      treated as literals. */
786 
787                                 default:
788                                     c = *ptr;              /* The final character */
789                                     classCharCount -= 2;  /* Undo the default count from above */
790                             }
791                         }
792 
793                         /* Fall through if we have a single character (c >= 0). This may be
794                          > 256 in UTF-8 mode. */
795 
796                     }   /* End of backslash handling */
797 
798                     /* A single character may be followed by '-' to form a range. However,
799                      Perl does not permit ']' to be the end of the range. A '-' character
800                      here is treated as a literal. */
801 
802                     if ((ptr + 2 < patternEnd) && ptr[1] == '-' && ptr[2] != ']') {
803                         ptr += 2;
804 
805                         int d = *ptr;
806 
807                         /* The second part of a range can be a single-character escape, but
808                          not any of the other escapes. Perl 5.6 treats a hyphen as a literal
809                          in such circumstances. */
810 
811                         if (d == '\\') {
812                             const UChar* oldptr = ptr;
813                             d = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, true);
814 
815                             /* \X is literal X; any other special means the '-' was literal */
816                             if (d < 0) {
817                                 ptr = oldptr - 2;
818                                 goto LONE_SINGLE_CHARACTER;  /* A few lines below */
819                             }
820                         }
821 
822                         /* The check that the two values are in the correct order happens in
823                          the pre-pass. Optimize one-character ranges */
824 
825                         if (d == c)
826                             goto LONE_SINGLE_CHARACTER;  /* A few lines below */
827 
828                         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
829                          matching, we have to use an XCLASS with extra data items. Caseless
830                          matching for characters > 127 is available only if UCP support is
831                          available. */
832 
833                         if ((d > 255 || ((options & IgnoreCaseOption) && d > 127))) {
834                             class_utf8 = true;
835 
836                             /* With UCP support, we can find the other case equivalents of
837                              the relevant characters. There may be several ranges. Optimize how
838                              they fit with the basic range. */
839 
840                             if (options & IgnoreCaseOption) {
841                                 int occ, ocd;
842                                 int cc = c;
843                                 int origd = d;
844                                 while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
845                                     if (occ >= c && ocd <= d)
846                                         continue;  /* Skip embedded ranges */
847 
848                                     if (occ < c  && ocd >= c - 1)        /* Extend the basic range */
849                                     {                                  /* if there is overlap,   */
850                                         c = occ;                           /* noting that if occ < c */
851                                         continue;                          /* we can't have ocd > d  */
852                                     }                                  /* because a subrange is  */
853                                     if (ocd > d && occ <= d + 1)         /* always shorter than    */
854                                     {                                  /* the basic range.       */
855                                         d = ocd;
856                                         continue;
857                                     }
858 
859                                     if (occ == ocd)
860                                         *class_utf8data++ = XCL_SINGLE;
861                                     else {
862                                         *class_utf8data++ = XCL_RANGE;
863                                         class_utf8data += encodeUTF8(occ, class_utf8data);
864                                     }
865                                     class_utf8data += encodeUTF8(ocd, class_utf8data);
866                                 }
867                             }
868 
869                             /* Now record the original range, possibly modified for UCP caseless
870                              overlapping ranges. */
871 
872                             *class_utf8data++ = XCL_RANGE;
873                             class_utf8data += encodeUTF8(c, class_utf8data);
874                             class_utf8data += encodeUTF8(d, class_utf8data);
875 
876                             /* With UCP support, we are done. Without UCP support, there is no
877                              caseless matching for UTF-8 characters > 127; we can use the bit map
878                              for the smaller ones. */
879 
880                             continue;    /* With next character in the class */
881                         }
882 
883                         /* We use the bit map for all cases when not in UTF-8 mode; else
884                          ranges that lie entirely within 0-127 when there is UCP support; else
885                          for partial ranges without UCP support. */
886 
887                         for (; c <= d; c++) {
888                             classbits[c/8] |= (1 << (c&7));
889                             if (options & IgnoreCaseOption) {
890                                 int uc = flipCase(c);
891                                 classbits[uc/8] |= (1 << (uc&7));
892                             }
893                             classCharCount++;                /* in case a one-char range */
894                             classLastChar = c;
895                         }
896 
897                         continue;   /* Go get the next char in the class */
898                     }
899 
900                     /* Handle a lone single character - we can get here for a normal
901                      non-escape char, or after \ that introduces a single character or for an
902                      apparent range that isn't. */
903 
904                 LONE_SINGLE_CHARACTER:
905 
906                     /* Handle a character that cannot go in the bit map */
907 
908                     if ((c > 255 || ((options & IgnoreCaseOption) && c > 127))) {
909                         class_utf8 = true;
910                         *class_utf8data++ = XCL_SINGLE;
911                         class_utf8data += encodeUTF8(c, class_utf8data);
912 
913                         if (options & IgnoreCaseOption) {
914                             int othercase;
915                             if ((othercase = jsc_pcre_ucp_othercase(c)) >= 0) {
916                                 *class_utf8data++ = XCL_SINGLE;
917                                 class_utf8data += encodeUTF8(othercase, class_utf8data);
918                             }
919                         }
920                     } else {
921                         /* Handle a single-byte character */
922                         classbits[c/8] |= (1 << (c&7));
923                         if (options & IgnoreCaseOption) {
924                             c = flipCase(c);
925                             classbits[c/8] |= (1 << (c&7));
926                         }
927                         classCharCount++;
928                         classLastChar = c;
929                     }
930                 }
931 
932                 /* If classCharCount is 1, we saw precisely one character whose value is
933                  less than 256. In non-UTF-8 mode we can always optimize. In UTF-8 mode, we
934                  can optimize the negative case only if there were no characters >= 128
935                  because OP_NOT and the related opcodes like OP_NOTSTAR operate on
936                  single-bytes only. This is an historical hangover. Maybe one day we can
937                  tidy these opcodes to handle multi-byte characters.
938 
939                  The optimization throws away the bit map. We turn the item into a
940                  1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
941                  that OP_NOT does not support multibyte characters. In the positive case, it
942                  can cause firstByte to be set. Otherwise, there can be no first char if
943                  this item is first, whatever repeat count may follow. In the case of
944                  reqByte, save the previous value for reinstating. */
945 
946                 if (classCharCount == 1 && (!class_utf8 && (!negateClass || classLastChar < 128))) {
947                     zeroReqByte = reqByte;
948 
949                     /* The OP_NOT opcode works on one-byte characters only. */
950 
951                     if (negateClass) {
952                         if (firstByte == REQ_UNSET)
953                             firstByte = REQ_NONE;
954                         zeroFirstByte = firstByte;
955                         *code++ = OP_NOT;
956                         *code++ = classLastChar;
957                         break;
958                     }
959 
960                     /* For a single, positive character, get the value into c, and
961                      then we can handle this with the normal one-character code. */
962 
963                     c = classLastChar;
964                     goto NORMAL_CHAR;
965                 }       /* End of 1-char optimization */
966 
967                 /* The general case - not the one-char optimization. If this is the first
968                  thing in the branch, there can be no first char setting, whatever the
969                  repeat count. Any reqByte setting must remain unchanged after any kind of
970                  repeat. */
971 
972                 if (firstByte == REQ_UNSET) firstByte = REQ_NONE;
973                 zeroFirstByte = firstByte;
974                 zeroReqByte = reqByte;
975 
976                 /* If there are characters with values > 255, we have to compile an
977                  extended class, with its own opcode. If there are no characters < 256,
978                  we can omit the bitmap. */
979 
980                 if (class_utf8 && !shouldFlipNegation) {
981                     *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
982                     *code++ = OP_XCLASS;
983                     code += LINK_SIZE;
984                     *code = negateClass? XCL_NOT : 0;
985 
986                     /* If the map is required, install it, and move on to the end of
987                      the extra data */
988 
989                     if (classCharCount > 0) {
990                         *code++ |= XCL_MAP;
991                         memcpy(code, classbits, 32);
992                         code = class_utf8data;
993                     }
994 
995                     /* If the map is not required, slide down the extra data. */
996 
997                     else {
998                         int len = class_utf8data - (code + 33);
999                         memmove(code + 1, code + 33, len);
1000                         code += len + 1;
1001                     }
1002 
1003                     /* Now fill in the complete length of the item */
1004 
1005                     putLinkValue(previous + 1, code - previous);
1006                     break;   /* End of class handling */
1007                 }
1008 
1009                 /* If there are no characters > 255, negate the 32-byte map if necessary,
1010                  and copy it into the code vector. If this is the first thing in the branch,
1011                  there can be no first char setting, whatever the repeat count. Any reqByte
1012                  setting must remain unchanged after any kind of repeat. */
1013 
1014                 *code++ = (negateClass == shouldFlipNegation) ? OP_CLASS : OP_NCLASS;
1015                 if (negateClass)
1016                     for (c = 0; c < 32; c++)
1017                         code[c] = ~classbits[c];
1018                 else
1019                     memcpy(code, classbits, 32);
1020                 code += 32;
1021                 break;
1022             }
1023 
1024             /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
1025              has been tested above. */
1026 
1027             case '{':
1028                 if (!isQuantifier)
1029                     goto NORMAL_CHAR;
1030                 ptr = readRepeatCounts(ptr + 1, &repeatMin, &repeat_max, errorCodePtr);
1031                 if (*errorCodePtr)
1032                     goto FAILED;
1033                 goto REPEAT;
1034 
1035             case '*':
1036                 repeatMin = 0;
1037                 repeat_max = -1;
1038                 goto REPEAT;
1039 
1040             case '+':
1041                 repeatMin = 1;
1042                 repeat_max = -1;
1043                 goto REPEAT;
1044 
1045             case '?':
1046                 repeatMin = 0;
1047                 repeat_max = 1;
1048 
1049             REPEAT:
1050                 if (!previous) {
1051                     *errorCodePtr = ERR9;
1052                     goto FAILED;
1053                 }
1054 
1055                 if (repeatMin == 0) {
1056                     firstByte = zeroFirstByte;    /* Adjust for zero repeat */
1057                     reqByte = zeroReqByte;        /* Ditto */
1058                 }
1059 
1060                 /* Remember whether this is a variable length repeat */
1061 
1062                 reqvary = (repeatMin == repeat_max) ? 0 : REQ_VARY;
1063 
1064                 opType = 0;                    /* Default single-char op codes */
1065 
1066                 /* Save start of previous item, in case we have to move it up to make space
1067                  for an inserted OP_ONCE for the additional '+' extension. */
1068                 /* FIXME: Probably don't need this because we don't use OP_ONCE. */
1069 
1070                 tempcode = previous;
1071 
1072                 /* If the next character is '+', we have a possessive quantifier. This
1073                  implies greediness, whatever the setting of the PCRE_UNGREEDY option.
1074                  If the next character is '?' this is a minimizing repeat, by default,
1075                  but if PCRE_UNGREEDY is set, it works the other way round. We change the
1076                  repeat type to the non-default. */
1077 
1078                 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
1079                     repeatType = 1;
1080                     ptr++;
1081                 } else
1082                     repeatType = 0;
1083 
1084                 /* If previous was a character match, abolish the item and generate a
1085                  repeat item instead. If a char item has a minumum of more than one, ensure
1086                  that it is set in reqByte - it might not be if a sequence such as x{3} is
1087                  the first thing in a branch because the x will have gone into firstByte
1088                  instead.  */
1089 
1090                 if (*previous == OP_CHAR || *previous == OP_CHAR_IGNORING_CASE) {
1091                     /* Deal with UTF-8 characters that take up more than one byte. It's
1092                      easier to write this out separately than try to macrify it. Use c to
1093                      hold the length of the character in bytes, plus 0x80 to flag that it's a
1094                      length rather than a small character. */
1095 
1096                     if (code[-1] & 0x80) {
1097                         unsigned char *lastchar = code - 1;
1098                         while((*lastchar & 0xc0) == 0x80)
1099                             lastchar--;
1100                         c = code - lastchar;            /* Length of UTF-8 character */
1101                         memcpy(utf8_char, lastchar, c); /* Save the char */
1102                         c |= 0x80;                      /* Flag c as a length */
1103                     }
1104                     else {
1105                         c = code[-1];
1106                         if (repeatMin > 1)
1107                             reqByte = c | reqCaseOpt | cd.reqVaryOpt;
1108                     }
1109 
1110                     goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
1111                 }
1112 
1113                 else if (*previous == OP_ASCII_CHAR || *previous == OP_ASCII_LETTER_IGNORING_CASE) {
1114                     c = previous[1];
1115                     if (repeatMin > 1)
1116                         reqByte = c | reqCaseOpt | cd.reqVaryOpt;
1117                     goto OUTPUT_SINGLE_REPEAT;
1118                 }
1119 
1120                 /* If previous was a single negated character ([^a] or similar), we use
1121                  one of the special opcodes, replacing it. The code is shared with single-
1122                  character repeats by setting opt_type to add a suitable offset into
1123                  repeatType. OP_NOT is currently used only for single-byte chars. */
1124 
1125                 else if (*previous == OP_NOT) {
1126                     opType = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
1127                     c = previous[1];
1128                     goto OUTPUT_SINGLE_REPEAT;
1129                 }
1130 
1131                 /* If previous was a character type match (\d or similar), abolish it and
1132                  create a suitable repeat item. The code is shared with single-character
1133                  repeats by setting opType to add a suitable offset into repeatType. */
1134 
1135                 else if (*previous <= OP_NOT_NEWLINE) {
1136                     opType = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
1137                     c = *previous;
1138 
1139                 OUTPUT_SINGLE_REPEAT:
1140                     int prop_type = -1;
1141                     int prop_value = -1;
1142 
1143                     unsigned char* oldcode = code;
1144                     code = previous;                  /* Usually overwrite previous item */
1145 
1146                     /* If the maximum is zero then the minimum must also be zero; Perl allows
1147                      this case, so we do too - by simply omitting the item altogether. */
1148 
1149                     if (repeat_max == 0)
1150                         goto END_REPEAT;
1151 
1152                     /* Combine the opType with the repeatType */
1153 
1154                     repeatType += opType;
1155 
1156                     /* A minimum of zero is handled either as the special case * or ?, or as
1157                      an UPTO, with the maximum given. */
1158 
1159                     if (repeatMin == 0) {
1160                         if (repeat_max == -1)
1161                             *code++ = OP_STAR + repeatType;
1162                         else if (repeat_max == 1)
1163                             *code++ = OP_QUERY + repeatType;
1164                         else {
1165                             *code++ = OP_UPTO + repeatType;
1166                             put2ByteValueAndAdvance(code, repeat_max);
1167                         }
1168                     }
1169 
1170                     /* A repeat minimum of 1 is optimized into some special cases. If the
1171                      maximum is unlimited, we use OP_PLUS. Otherwise, the original item it
1172                      left in place and, if the maximum is greater than 1, we use OP_UPTO with
1173                      one less than the maximum. */
1174 
1175                     else if (repeatMin == 1) {
1176                         if (repeat_max == -1)
1177                             *code++ = OP_PLUS + repeatType;
1178                         else {
1179                             code = oldcode;                 /* leave previous item in place */
1180                             if (repeat_max == 1)
1181                                 goto END_REPEAT;
1182                             *code++ = OP_UPTO + repeatType;
1183                             put2ByteValueAndAdvance(code, repeat_max - 1);
1184                         }
1185                     }
1186 
1187                     /* The case {n,n} is just an EXACT, while the general case {n,m} is
1188                      handled as an EXACT followed by an UPTO. */
1189 
1190                     else {
1191                         *code++ = OP_EXACT + opType;  /* NB EXACT doesn't have repeatType */
1192                         put2ByteValueAndAdvance(code, repeatMin);
1193 
1194                         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
1195                          we have to insert the character for the previous code. For a repeated
1196                          Unicode property match, there are two extra bytes that define the
1197                          required property. In UTF-8 mode, long characters have their length in
1198                          c, with the 0x80 bit as a flag. */
1199 
1200                         if (repeat_max < 0) {
1201                             if (c >= 128) {
1202                                 memcpy(code, utf8_char, c & 7);
1203                                 code += c & 7;
1204                             } else {
1205                                 *code++ = c;
1206                                 if (prop_type >= 0) {
1207                                     *code++ = prop_type;
1208                                     *code++ = prop_value;
1209                                 }
1210                             }
1211                             *code++ = OP_STAR + repeatType;
1212                         }
1213 
1214                         /* Else insert an UPTO if the max is greater than the min, again
1215                          preceded by the character, for the previously inserted code. */
1216 
1217                         else if (repeat_max != repeatMin) {
1218                             if (c >= 128) {
1219                                 memcpy(code, utf8_char, c & 7);
1220                                 code += c & 7;
1221                             } else
1222                                 *code++ = c;
1223                             if (prop_type >= 0) {
1224                                 *code++ = prop_type;
1225                                 *code++ = prop_value;
1226                             }
1227                             repeat_max -= repeatMin;
1228                             *code++ = OP_UPTO + repeatType;
1229                             put2ByteValueAndAdvance(code, repeat_max);
1230                         }
1231                     }
1232 
1233                     /* The character or character type itself comes last in all cases. */
1234 
1235                     if (c >= 128) {
1236                         memcpy(code, utf8_char, c & 7);
1237                         code += c & 7;
1238                     } else
1239                         *code++ = c;
1240 
1241                     /* For a repeated Unicode property match, there are two extra bytes that
1242                      define the required property. */
1243 
1244                     if (prop_type >= 0) {
1245                         *code++ = prop_type;
1246                         *code++ = prop_value;
1247                     }
1248                 }
1249 
1250                 /* If previous was a character class or a back reference, we put the repeat
1251                  stuff after it, but just skip the item if the repeat was {0,0}. */
1252 
1253                 else if (*previous == OP_CLASS ||
1254                          *previous == OP_NCLASS ||
1255                          *previous == OP_XCLASS ||
1256                          *previous == OP_REF)
1257                 {
1258                     if (repeat_max == 0) {
1259                         code = previous;
1260                         goto END_REPEAT;
1261                     }
1262 
1263                     if (repeatMin == 0 && repeat_max == -1)
1264                         *code++ = OP_CRSTAR + repeatType;
1265                     else if (repeatMin == 1 && repeat_max == -1)
1266                         *code++ = OP_CRPLUS + repeatType;
1267                     else if (repeatMin == 0 && repeat_max == 1)
1268                         *code++ = OP_CRQUERY + repeatType;
1269                     else {
1270                         *code++ = OP_CRRANGE + repeatType;
1271                         put2ByteValueAndAdvance(code, repeatMin);
1272                         if (repeat_max == -1)
1273                             repeat_max = 0;  /* 2-byte encoding for max */
1274                         put2ByteValueAndAdvance(code, repeat_max);
1275                     }
1276                 }
1277 
1278                 /* If previous was a bracket group, we may have to replicate it in certain
1279                  cases. */
1280 
1281                 else if (*previous >= OP_BRA) {
1282                     int ketoffset = 0;
1283                     int len = code - previous;
1284                     unsigned char* bralink = NULL;
1285 
1286                     /* If the maximum repeat count is unlimited, find the end of the bracket
1287                      by scanning through from the start, and compute the offset back to it
1288                      from the current code pointer. There may be an OP_OPT setting following
1289                      the final KET, so we can't find the end just by going back from the code
1290                      pointer. */
1291 
1292                     if (repeat_max == -1) {
1293                         const unsigned char* ket = previous;
1294                         advanceToEndOfBracket(ket);
1295                         ketoffset = code - ket;
1296                     }
1297 
1298                     /* The case of a zero minimum is special because of the need to stick
1299                      OP_BRAZERO in front of it, and because the group appears once in the
1300                      data, whereas in other cases it appears the minimum number of times. For
1301                      this reason, it is simplest to treat this case separately, as otherwise
1302                      the code gets far too messy. There are several special subcases when the
1303                      minimum is zero. */
1304 
1305                     if (repeatMin == 0) {
1306                         /* If the maximum is also zero, we just omit the group from the output
1307                          altogether. */
1308 
1309                         if (repeat_max == 0) {
1310                             code = previous;
1311                             goto END_REPEAT;
1312                         }
1313 
1314                         /* If the maximum is 1 or unlimited, we just have to stick in the
1315                          BRAZERO and do no more at this point. However, we do need to adjust
1316                          any OP_RECURSE calls inside the group that refer to the group itself or
1317                          any internal group, because the offset is from the start of the whole
1318                          regex. Temporarily terminate the pattern while doing this. */
1319 
1320                         if (repeat_max <= 1) {
1321                             *code = OP_END;
1322                             memmove(previous+1, previous, len);
1323                             code++;
1324                             *previous++ = OP_BRAZERO + repeatType;
1325                         }
1326 
1327                         /* If the maximum is greater than 1 and limited, we have to replicate
1328                          in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1329                          The first one has to be handled carefully because it's the original
1330                          copy, which has to be moved up. The remainder can be handled by code
1331                          that is common with the non-zero minimum case below. We have to
1332                          adjust the value of repeat_max, since one less copy is required. */
1333 
1334                         else {
1335                             *code = OP_END;
1336                             memmove(previous + 2 + LINK_SIZE, previous, len);
1337                             code += 2 + LINK_SIZE;
1338                             *previous++ = OP_BRAZERO + repeatType;
1339                             *previous++ = OP_BRA;
1340 
1341                             /* We chain together the bracket offset fields that have to be
1342                              filled in later when the ends of the brackets are reached. */
1343 
1344                             int offset = (!bralink) ? 0 : previous - bralink;
1345                             bralink = previous;
1346                             putLinkValueAllowZeroAndAdvance(previous, offset);
1347                         }
1348 
1349                         repeat_max--;
1350                     }
1351 
1352                     /* If the minimum is greater than zero, replicate the group as many
1353                      times as necessary, and adjust the maximum to the number of subsequent
1354                      copies that we need. If we set a first char from the group, and didn't
1355                      set a required char, copy the latter from the former. */
1356 
1357                     else {
1358                         if (repeatMin > 1) {
1359                             if (didGroupSetFirstByte && reqByte < 0)
1360                                 reqByte = firstByte;
1361                             for (int i = 1; i < repeatMin; i++) {
1362                                 memcpy(code, previous, len);
1363                                 code += len;
1364                             }
1365                         }
1366                         if (repeat_max > 0)
1367                             repeat_max -= repeatMin;
1368                     }
1369 
1370                     /* This code is common to both the zero and non-zero minimum cases. If
1371                      the maximum is limited, it replicates the group in a nested fashion,
1372                      remembering the bracket starts on a stack. In the case of a zero minimum,
1373                      the first one was set up above. In all cases the repeat_max now specifies
1374                      the number of additional copies needed. */
1375 
1376                     if (repeat_max >= 0) {
1377                         for (int i = repeat_max - 1; i >= 0; i--) {
1378                             *code++ = OP_BRAZERO + repeatType;
1379 
1380                             /* All but the final copy start a new nesting, maintaining the
1381                              chain of brackets outstanding. */
1382 
1383                             if (i != 0) {
1384                                 *code++ = OP_BRA;
1385                                 int offset = (!bralink) ? 0 : code - bralink;
1386                                 bralink = code;
1387                                 putLinkValueAllowZeroAndAdvance(code, offset);
1388                             }
1389 
1390                             memcpy(code, previous, len);
1391                             code += len;
1392                         }
1393 
1394                         /* Now chain through the pending brackets, and fill in their length
1395                          fields (which are holding the chain links pro tem). */
1396 
1397                         while (bralink) {
1398                             int offset = code - bralink + 1;
1399                             unsigned char* bra = code - offset;
1400                             int oldlinkoffset = getLinkValueAllowZero(bra + 1);
1401                             bralink = (!oldlinkoffset) ? 0 : bralink - oldlinkoffset;
1402                             *code++ = OP_KET;
1403                             putLinkValueAndAdvance(code, offset);
1404                             putLinkValue(bra + 1, offset);
1405                         }
1406                     }
1407 
1408                     /* If the maximum is unlimited, set a repeater in the final copy. We
1409                      can't just offset backwards from the current code point, because we
1410                      don't know if there's been an options resetting after the ket. The
1411                      correct offset was computed above. */
1412 
1413                     else
1414                         code[-ketoffset] = OP_KETRMAX + repeatType;
1415                 }
1416 
1417                 // A quantifier after an assertion is mostly meaningless, but it
1418                 // can nullify the assertion if it has a 0 minimum.
1419                 else if (*previous == OP_ASSERT || *previous == OP_ASSERT_NOT) {
1420                     if (repeatMin == 0) {
1421                         code = previous;
1422                         goto END_REPEAT;
1423                     }
1424                 }
1425 
1426                 /* Else there's some kind of shambles */
1427 
1428                 else {
1429                     *errorCodePtr = ERR11;
1430                     goto FAILED;
1431                 }
1432 
1433                 /* In all case we no longer have a previous item. We also set the
1434                  "follows varying string" flag for subsequently encountered reqbytes if
1435                  it isn't already set and we have just passed a varying length item. */
1436 
1437             END_REPEAT:
1438                 previous = NULL;
1439                 cd.reqVaryOpt |= reqvary;
1440                 break;
1441 
1442             /* Start of nested bracket sub-expression, or comment or lookahead or
1443              lookbehind or option setting or condition. First deal with special things
1444              that can come after a bracket; all are introduced by ?, and the appearance
1445              of any of them means that this is not a referencing group. They were
1446              checked for validity in the first pass over the string, so we don't have to
1447              check for syntax errors here.  */
1448 
1449             case '(':
1450                 skipBytes = 0;
1451 
1452                 if (*(++ptr) == '?') {
1453                     switch (*(++ptr)) {
1454                         case ':':                 /* Non-extracting bracket */
1455                             bravalue = OP_BRA;
1456                             ptr++;
1457                             break;
1458 
1459                         case '=':                 /* Positive lookahead */
1460                             bravalue = OP_ASSERT;
1461                             ptr++;
1462                             break;
1463 
1464                         case '!':                 /* Negative lookahead */
1465                             bravalue = OP_ASSERT_NOT;
1466                             ptr++;
1467                             break;
1468 
1469                         /* Character after (? not specially recognized */
1470 
1471                         default:
1472                             *errorCodePtr = ERR12;
1473                             goto FAILED;
1474                         }
1475                 }
1476 
1477                 /* Else we have a referencing group; adjust the opcode. If the bracket
1478                  number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1479                  arrange for the true number to follow later, in an OP_BRANUMBER item. */
1480 
1481                 else {
1482                     if (++(*brackets) > EXTRACT_BASIC_MAX) {
1483                         bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1484                         code[1 + LINK_SIZE] = OP_BRANUMBER;
1485                         put2ByteValue(code + 2 + LINK_SIZE, *brackets);
1486                         skipBytes = 3;
1487                     }
1488                     else
1489                         bravalue = OP_BRA + *brackets;
1490                 }
1491 
1492                 /* Process nested bracketed re. We copy code into a non-variable
1493                  in order to be able to pass its address because some compilers
1494                  complain otherwise. Pass in a new setting for the ims options
1495                  if they have changed. */
1496 
1497                 previous = code;
1498                 *code = bravalue;
1499                 tempcode = code;
1500                 tempreqvary = cd.reqVaryOpt;     /* Save value before bracket */
1501 
1502                 if (!compileBracket(
1503                                    options,
1504                                    brackets,                     /* Extracting bracket count */
1505                                    &tempcode,                    /* Where to put code (updated) */
1506                                    &ptr,                         /* Input pointer (updated) */
1507                                    patternEnd,
1508                                    errorCodePtr,                 /* Where to put an error message */
1509                                    skipBytes,                    /* Skip over OP_BRANUMBER */
1510                                    &subFirstByte,                /* For possible first char */
1511                                    &subReqByte,                  /* For possible last char */
1512                                    cd))                          /* Tables block */
1513                     goto FAILED;
1514 
1515                 /* At the end of compiling, code is still pointing to the start of the
1516                  group, while tempcode has been updated to point past the end of the group
1517                  and any option resetting that may follow it. The pattern pointer (ptr)
1518                  is on the bracket. */
1519 
1520                 /* Handle updating of the required and first characters. Update for normal
1521                  brackets of all kinds, and conditions with two branches (see code above).
1522                  If the bracket is followed by a quantifier with zero repeat, we have to
1523                  back off. Hence the definition of zeroReqByte and zeroFirstByte outside the
1524                  main loop so that they can be accessed for the back off. */
1525 
1526                 zeroReqByte = reqByte;
1527                 zeroFirstByte = firstByte;
1528                 didGroupSetFirstByte = false;
1529 
1530                 if (bravalue >= OP_BRA) {
1531                     /* If we have not yet set a firstByte in this branch, take it from the
1532                      subpattern, remembering that it was set here so that a repeat of more
1533                      than one can replicate it as reqByte if necessary. If the subpattern has
1534                      no firstByte, set "none" for the whole branch. In both cases, a zero
1535                      repeat forces firstByte to "none". */
1536 
1537                     if (firstByte == REQ_UNSET) {
1538                         if (subFirstByte >= 0) {
1539                             firstByte = subFirstByte;
1540                             didGroupSetFirstByte = true;
1541                         }
1542                         else
1543                             firstByte = REQ_NONE;
1544                         zeroFirstByte = REQ_NONE;
1545                     }
1546 
1547                     /* If firstByte was previously set, convert the subpattern's firstByte
1548                      into reqByte if there wasn't one, using the vary flag that was in
1549                      existence beforehand. */
1550 
1551                     else if (subFirstByte >= 0 && subReqByte < 0)
1552                         subReqByte = subFirstByte | tempreqvary;
1553 
1554                     /* If the subpattern set a required byte (or set a first byte that isn't
1555                      really the first byte - see above), set it. */
1556 
1557                     if (subReqByte >= 0)
1558                         reqByte = subReqByte;
1559                 }
1560 
1561                 /* For a forward assertion, we take the reqByte, if set. This can be
1562                  helpful if the pattern that follows the assertion doesn't set a different
1563                  char. For example, it's useful for /(?=abcde).+/. We can't set firstByte
1564                  for an assertion, however because it leads to incorrect effect for patterns
1565                  such as /(?=a)a.+/ when the "real" "a" would then become a reqByte instead
1566                  of a firstByte. This is overcome by a scan at the end if there's no
1567                  firstByte, looking for an asserted first char. */
1568 
1569                 else if (bravalue == OP_ASSERT && subReqByte >= 0)
1570                     reqByte = subReqByte;
1571 
1572                 /* Now update the main code pointer to the end of the group. */
1573 
1574                 code = tempcode;
1575 
1576                 /* Error if hit end of pattern */
1577 
1578                 if (ptr >= patternEnd || *ptr != ')') {
1579                     *errorCodePtr = ERR14;
1580                     goto FAILED;
1581                 }
1582                 break;
1583 
1584             /* Check \ for being a real metacharacter; if not, fall through and handle
1585              it as a data character at the start of a string. Escape items are checked
1586              for validity in the pre-compiling pass. */
1587 
1588             case '\\':
1589                 tempptr = ptr;
1590                 c = checkEscape(&ptr, patternEnd, errorCodePtr, cd.numCapturingBrackets, false);
1591 
1592                 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1593                  are arranged to be the negation of the corresponding OP_values. For the
1594                  back references, the values are ESC_REF plus the reference number. Only
1595                  back references and those types that consume a character may be repeated.
1596                  We can test for values between ESC_b and ESC_w for the latter; this may
1597                  have to change if any new ones are ever created. */
1598 
1599                 if (c < 0) {
1600                     /* For metasequences that actually match a character, we disable the
1601                      setting of a first character if it hasn't already been set. */
1602 
1603                     if (firstByte == REQ_UNSET && -c > ESC_b && -c <= ESC_w)
1604                         firstByte = REQ_NONE;
1605 
1606                     /* Set values to reset to if this is followed by a zero repeat. */
1607 
1608                     zeroFirstByte = firstByte;
1609                     zeroReqByte = reqByte;
1610 
1611                     /* Back references are handled specially */
1612 
1613                     if (-c >= ESC_REF) {
1614                         int number = -c - ESC_REF;
1615                         previous = code;
1616                         *code++ = OP_REF;
1617                         put2ByteValueAndAdvance(code, number);
1618                     }
1619 
1620                     /* For the rest, we can obtain the OP value by negating the escape
1621                      value */
1622 
1623                     else {
1624                         previous = (-c > ESC_b && -c <= ESC_w) ? code : NULL;
1625                         *code++ = -c;
1626                     }
1627                     continue;
1628                 }
1629 
1630                 /* Fall through. */
1631 
1632                 /* Handle a literal character. It is guaranteed not to be whitespace or #
1633                  when the extended flag is set. If we are in UTF-8 mode, it may be a
1634                  multi-byte literal character. */
1635 
1636                 default:
1637             NORMAL_CHAR:
1638 
1639                 previous = code;
1640 
1641                 if (c < 128) {
1642                     mcLength = 1;
1643                     mcbuffer[0] = c;
1644 
1645                     if ((options & IgnoreCaseOption) && (c | 0x20) >= 'a' && (c | 0x20) <= 'z') {
1646                         *code++ = OP_ASCII_LETTER_IGNORING_CASE;
1647                         *code++ = c | 0x20;
1648                     } else {
1649                         *code++ = OP_ASCII_CHAR;
1650                         *code++ = c;
1651                     }
1652                 } else {
1653                     mcLength = encodeUTF8(c, mcbuffer);
1654 
1655                     *code++ = (options & IgnoreCaseOption) ? OP_CHAR_IGNORING_CASE : OP_CHAR;
1656                     for (c = 0; c < mcLength; c++)
1657                         *code++ = mcbuffer[c];
1658                 }
1659 
1660                 /* Set the first and required bytes appropriately. If no previous first
1661                  byte, set it from this character, but revert to none on a zero repeat.
1662                  Otherwise, leave the firstByte value alone, and don't change it on a zero
1663                  repeat. */
1664 
1665                 if (firstByte == REQ_UNSET) {
1666                     zeroFirstByte = REQ_NONE;
1667                     zeroReqByte = reqByte;
1668 
1669                     /* If the character is more than one byte long, we can set firstByte
1670                      only if it is not to be matched caselessly. */
1671 
1672                     if (mcLength == 1 || reqCaseOpt == 0) {
1673                         firstByte = mcbuffer[0] | reqCaseOpt;
1674                         if (mcLength != 1)
1675                             reqByte = code[-1] | cd.reqVaryOpt;
1676                     }
1677                     else
1678                         firstByte = reqByte = REQ_NONE;
1679                 }
1680 
1681                 /* firstByte was previously set; we can set reqByte only the length is
1682                  1 or the matching is caseful. */
1683 
1684                 else {
1685                     zeroFirstByte = firstByte;
1686                     zeroReqByte = reqByte;
1687                     if (mcLength == 1 || reqCaseOpt == 0)
1688                         reqByte = code[-1] | reqCaseOpt | cd.reqVaryOpt;
1689                 }
1690 
1691                 break;            /* End of literal character handling */
1692         }
1693     }                   /* end of big loop */
1694 
1695     /* Control never reaches here by falling through, only by a goto for all the
1696      error states. Pass back the position in the pattern so that it can be displayed
1697      to the user for diagnosing the error. */
1698 
1699 FAILED:
1700     *ptrPtr = ptr;
1701     return false;
1702 }
1703 
1704 /*************************************************
1705 *     Compile sequence of alternatives           *
1706 *************************************************/
1707 
1708 /* On entry, ptr is pointing past the bracket character, but on return
1709 it points to the closing bracket, or vertical bar, or end of string.
1710 The code variable is pointing at the byte into which the BRA operator has been
1711 stored. If the ims options are changed at the start (for a (?ims: group) or
1712 during any branch, we need to insert an OP_OPT item at the start of every
1713 following branch to ensure they get set correctly at run time, and also pass
1714 the new options into every subsequent branch compile.
1715 
1716 Argument:
1717   options        option bits, including any changes for this subpattern
1718   brackets       -> int containing the number of extracting brackets used
1719   codePtr        -> the address of the current code pointer
1720   ptrPtr         -> the address of the current pattern pointer
1721   errorCodePtr   -> pointer to error code variable
1722   skipBytes      skip this many bytes at start (for OP_BRANUMBER)
1723   firstbyteptr   place to put the first required character, or a negative number
1724   reqbyteptr     place to put the last required character, or a negative number
1725   cd             points to the data block with tables pointers etc.
1726 
1727 Returns:      true on success
1728 */
1729 
1730 static bool
compileBracket(int options,int * brackets,unsigned char ** codePtr,const UChar ** ptrPtr,const UChar * patternEnd,ErrorCode * errorCodePtr,int skipBytes,int * firstbyteptr,int * reqbyteptr,CompileData & cd)1731 compileBracket(int options, int* brackets, unsigned char** codePtr,
1732     const UChar** ptrPtr, const UChar* patternEnd, ErrorCode* errorCodePtr, int skipBytes,
1733     int* firstbyteptr, int* reqbyteptr, CompileData& cd)
1734 {
1735     const UChar* ptr = *ptrPtr;
1736     unsigned char* code = *codePtr;
1737     unsigned char* lastBranch = code;
1738     unsigned char* start_bracket = code;
1739     int firstByte = REQ_UNSET;
1740     int reqByte = REQ_UNSET;
1741 
1742     /* Offset is set zero to mark that this bracket is still open */
1743 
1744     putLinkValueAllowZero(code + 1, 0);
1745     code += 1 + LINK_SIZE + skipBytes;
1746 
1747     /* Loop for each alternative branch */
1748 
1749     while (true) {
1750         /* Now compile the branch */
1751 
1752         int branchFirstByte;
1753         int branchReqByte;
1754         if (!compileBranch(options, brackets, &code, &ptr, patternEnd, errorCodePtr,
1755                             &branchFirstByte, &branchReqByte, cd)) {
1756             *ptrPtr = ptr;
1757             return false;
1758         }
1759 
1760         /* If this is the first branch, the firstByte and reqByte values for the
1761          branch become the values for the regex. */
1762 
1763         if (*lastBranch != OP_ALT) {
1764             firstByte = branchFirstByte;
1765             reqByte = branchReqByte;
1766         }
1767 
1768         /* If this is not the first branch, the first char and reqByte have to
1769          match the values from all the previous branches, except that if the previous
1770          value for reqByte didn't have REQ_VARY set, it can still match, and we set
1771          REQ_VARY for the regex. */
1772 
1773         else {
1774             /* If we previously had a firstByte, but it doesn't match the new branch,
1775              we have to abandon the firstByte for the regex, but if there was previously
1776              no reqByte, it takes on the value of the old firstByte. */
1777 
1778             if (firstByte >= 0 && firstByte != branchFirstByte) {
1779                 if (reqByte < 0)
1780                     reqByte = firstByte;
1781                 firstByte = REQ_NONE;
1782             }
1783 
1784             /* If we (now or from before) have no firstByte, a firstByte from the
1785              branch becomes a reqByte if there isn't a branch reqByte. */
1786 
1787             if (firstByte < 0 && branchFirstByte >= 0 && branchReqByte < 0)
1788                 branchReqByte = branchFirstByte;
1789 
1790             /* Now ensure that the reqbytes match */
1791 
1792             if ((reqByte & ~REQ_VARY) != (branchReqByte & ~REQ_VARY))
1793                 reqByte = REQ_NONE;
1794             else
1795                 reqByte |= branchReqByte;   /* To "or" REQ_VARY */
1796         }
1797 
1798         /* Reached end of expression, either ')' or end of pattern. Go back through
1799          the alternative branches and reverse the chain of offsets, with the field in
1800          the BRA item now becoming an offset to the first alternative. If there are
1801          no alternatives, it points to the end of the group. The length in the
1802          terminating ket is always the length of the whole bracketed item. If any of
1803          the ims options were changed inside the group, compile a resetting op-code
1804          following, except at the very end of the pattern. Return leaving the pointer
1805          at the terminating char. */
1806 
1807         if (ptr >= patternEnd || *ptr != '|') {
1808             int length = code - lastBranch;
1809             do {
1810                 int prevLength = getLinkValueAllowZero(lastBranch + 1);
1811                 putLinkValue(lastBranch + 1, length);
1812                 length = prevLength;
1813                 lastBranch -= length;
1814             } while (length > 0);
1815 
1816             /* Fill in the ket */
1817 
1818             *code = OP_KET;
1819             putLinkValue(code + 1, code - start_bracket);
1820             code += 1 + LINK_SIZE;
1821 
1822             /* Set values to pass back */
1823 
1824             *codePtr = code;
1825             *ptrPtr = ptr;
1826             *firstbyteptr = firstByte;
1827             *reqbyteptr = reqByte;
1828             return true;
1829         }
1830 
1831         /* Another branch follows; insert an "or" node. Its length field points back
1832          to the previous branch while the bracket remains open. At the end the chain
1833          is reversed. It's done like this so that the start of the bracket has a
1834          zero offset until it is closed, making it possible to detect recursion. */
1835 
1836         *code = OP_ALT;
1837         putLinkValue(code + 1, code - lastBranch);
1838         lastBranch = code;
1839         code += 1 + LINK_SIZE;
1840         ptr++;
1841     }
1842     ASSERT_NOT_REACHED();
1843 }
1844 
1845 /*************************************************
1846 *          Check for anchored expression         *
1847 *************************************************/
1848 
1849 /* Try to find out if this is an anchored regular expression. Consider each
1850 alternative branch. If they all start OP_CIRC, or with a bracket
1851 all of whose alternatives start OP_CIRC (recurse ad lib), then
1852 it's anchored.
1853 
1854 Arguments:
1855   code          points to start of expression (the bracket)
1856   captureMap    a bitmap of which brackets we are inside while testing; this
1857                  handles up to substring 31; all brackets after that share
1858                  the zero bit
1859   backrefMap    the back reference bitmap
1860 */
1861 
branchIsAnchored(const unsigned char * code)1862 static bool branchIsAnchored(const unsigned char* code)
1863 {
1864     const unsigned char* scode = firstSignificantOpcode(code);
1865     int op = *scode;
1866 
1867     /* Brackets */
1868     if (op >= OP_BRA || op == OP_ASSERT)
1869         return bracketIsAnchored(scode);
1870 
1871     /* Check for explicit anchoring */
1872     return op == OP_CIRC;
1873 }
1874 
bracketIsAnchored(const unsigned char * code)1875 static bool bracketIsAnchored(const unsigned char* code)
1876 {
1877     do {
1878         if (!branchIsAnchored(code + 1 + LINK_SIZE))
1879             return false;
1880         code += getLinkValue(code + 1);
1881     } while (*code == OP_ALT);   /* Loop for each alternative */
1882     return true;
1883 }
1884 
1885 /*************************************************
1886 *         Check for starting with ^ or .*        *
1887 *************************************************/
1888 
1889 /* This is called to find out if every branch starts with ^ or .* so that
1890 "first char" processing can be done to speed things up in multiline
1891 matching and for non-DOTALL patterns that start with .* (which must start at
1892 the beginning or after \n)
1893 
1894 Except when the .* appears inside capturing parentheses, and there is a
1895 subsequent back reference to those parentheses. By keeping a bitmap of the
1896 first 31 back references, we can catch some of the more common cases more
1897 precisely; all the greater back references share a single bit.
1898 
1899 Arguments:
1900   code          points to start of expression (the bracket)
1901   captureMap    a bitmap of which brackets we are inside while testing; this
1902                  handles up to substring 31; all brackets after that share
1903                  the zero bit
1904   backrefMap    the back reference bitmap
1905 */
1906 
branchNeedsLineStart(const unsigned char * code,unsigned captureMap,unsigned backrefMap)1907 static bool branchNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
1908 {
1909     const unsigned char* scode = firstSignificantOpcode(code);
1910     int op = *scode;
1911 
1912     /* Capturing brackets */
1913     if (op > OP_BRA) {
1914         int captureNum = op - OP_BRA;
1915         if (captureNum > EXTRACT_BASIC_MAX)
1916             captureNum = get2ByteValue(scode + 2 + LINK_SIZE);
1917         int bracketMask = (captureNum < 32) ? (1 << captureNum) : 1;
1918         return bracketNeedsLineStart(scode, captureMap | bracketMask, backrefMap);
1919     }
1920 
1921     /* Other brackets */
1922     if (op == OP_BRA || op == OP_ASSERT)
1923         return bracketNeedsLineStart(scode, captureMap, backrefMap);
1924 
1925     /* .* means "start at start or after \n" if it isn't in brackets that
1926      may be referenced. */
1927 
1928     if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
1929         return scode[1] == OP_NOT_NEWLINE && !(captureMap & backrefMap);
1930 
1931     /* Explicit ^ */
1932     return op == OP_CIRC || op == OP_BOL;
1933 }
1934 
bracketNeedsLineStart(const unsigned char * code,unsigned captureMap,unsigned backrefMap)1935 static bool bracketNeedsLineStart(const unsigned char* code, unsigned captureMap, unsigned backrefMap)
1936 {
1937     do {
1938         if (!branchNeedsLineStart(code + 1 + LINK_SIZE, captureMap, backrefMap))
1939             return false;
1940         code += getLinkValue(code + 1);
1941     } while (*code == OP_ALT);  /* Loop for each alternative */
1942     return true;
1943 }
1944 
1945 /*************************************************
1946 *       Check for asserted fixed first char      *
1947 *************************************************/
1948 
1949 /* During compilation, the "first char" settings from forward assertions are
1950 discarded, because they can cause conflicts with actual literals that follow.
1951 However, if we end up without a first char setting for an unanchored pattern,
1952 it is worth scanning the regex to see if there is an initial asserted first
1953 char. If all branches start with the same asserted char, or with a bracket all
1954 of whose alternatives start with the same asserted char (recurse ad lib), then
1955 we return that char, otherwise -1.
1956 
1957 Arguments:
1958   code       points to start of expression (the bracket)
1959   options    pointer to the options (used to check casing changes)
1960   inassert   true if in an assertion
1961 
1962 Returns:     -1 or the fixed first char
1963 */
1964 
branchFindFirstAssertedCharacter(const unsigned char * code,bool inassert)1965 static int branchFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
1966 {
1967     const unsigned char* scode = firstSignificantOpcodeSkippingAssertions(code);
1968     int op = *scode;
1969 
1970     if (op >= OP_BRA)
1971         op = OP_BRA;
1972 
1973     switch (op) {
1974         default:
1975             return -1;
1976 
1977         case OP_BRA:
1978         case OP_ASSERT:
1979             return bracketFindFirstAssertedCharacter(scode, op == OP_ASSERT);
1980 
1981         case OP_EXACT:
1982             scode += 2;
1983             /* Fall through */
1984 
1985         case OP_CHAR:
1986         case OP_CHAR_IGNORING_CASE:
1987         case OP_ASCII_CHAR:
1988         case OP_ASCII_LETTER_IGNORING_CASE:
1989         case OP_PLUS:
1990         case OP_MINPLUS:
1991             if (!inassert)
1992                 return -1;
1993             return scode[1];
1994     }
1995 }
1996 
bracketFindFirstAssertedCharacter(const unsigned char * code,bool inassert)1997 static int bracketFindFirstAssertedCharacter(const unsigned char* code, bool inassert)
1998 {
1999     int c = -1;
2000     do {
2001         int d = branchFindFirstAssertedCharacter(code + 1 + LINK_SIZE, inassert);
2002         if (d < 0)
2003             return -1;
2004         if (c < 0)
2005             c = d;
2006         else if (c != d)
2007             return -1;
2008         code += getLinkValue(code + 1);
2009     } while (*code == OP_ALT);
2010     return c;
2011 }
2012 
multiplyWithOverflowCheck(int a,int b)2013 static inline int multiplyWithOverflowCheck(int a, int b)
2014 {
2015     if (!a || !b)
2016         return 0;
2017     if (a > MAX_PATTERN_SIZE / b)
2018         return -1;
2019     return a * b;
2020 }
2021 
calculateCompiledPatternLength(const UChar * pattern,int patternLength,JSRegExpIgnoreCaseOption ignoreCase,CompileData & cd,ErrorCode & errorcode)2022 static int calculateCompiledPatternLength(const UChar* pattern, int patternLength, JSRegExpIgnoreCaseOption ignoreCase,
2023     CompileData& cd, ErrorCode& errorcode)
2024 {
2025     /* Make a pass over the pattern to compute the
2026      amount of store required to hold the compiled code. This does not have to be
2027      perfect as long as errors are overestimates. */
2028 
2029     if (patternLength > MAX_PATTERN_SIZE) {
2030         errorcode = ERR16;
2031         return -1;
2032     }
2033 
2034     int length = 1 + LINK_SIZE;      /* For initial BRA plus length */
2035     int branch_extra = 0;
2036     int lastitemlength = 0;
2037     unsigned brastackptr = 0;
2038     int brastack[BRASTACK_SIZE];
2039     unsigned char bralenstack[BRASTACK_SIZE];
2040     int bracount = 0;
2041 
2042     const UChar* ptr = (const UChar*)(pattern - 1);
2043     const UChar* patternEnd = (const UChar*)(pattern + patternLength);
2044 
2045     while (++ptr < patternEnd) {
2046         int minRepeats = 0, maxRepeats = 0;
2047         int c = *ptr;
2048 
2049         switch (c) {
2050             /* A backslashed item may be an escaped data character or it may be a
2051              character type. */
2052 
2053             case '\\':
2054                 c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, false);
2055                 if (errorcode != 0)
2056                     return -1;
2057 
2058                 lastitemlength = 1;     /* Default length of last item for repeats */
2059 
2060                 if (c >= 0) {            /* Data character */
2061                     length += 2;          /* For a one-byte character */
2062 
2063                     if (c > 127) {
2064                         int i;
2065                         for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
2066                             if (c <= jsc_pcre_utf8_table1[i]) break;
2067                         length += i;
2068                         lastitemlength += i;
2069                     }
2070 
2071                     continue;
2072                 }
2073 
2074                 /* Other escapes need one byte */
2075 
2076                 length++;
2077 
2078                 /* A back reference needs an additional 2 bytes, plus either one or 5
2079                  bytes for a repeat. We also need to keep the value of the highest
2080                  back reference. */
2081 
2082                 if (c <= -ESC_REF) {
2083                     int refnum = -c - ESC_REF;
2084                     cd.backrefMap |= (refnum < 32) ? (1 << refnum) : 1;
2085                     if (refnum > cd.topBackref)
2086                         cd.topBackref = refnum;
2087                     length += 2;   /* For single back reference */
2088                     if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
2089                         ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
2090                         if (errorcode)
2091                             return -1;
2092                         if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2093                             (minRepeats == 1 && maxRepeats == -1))
2094                             length++;
2095                         else
2096                             length += 5;
2097                         if (safelyCheckNextChar(ptr, patternEnd, '?'))
2098                             ptr++;
2099                     }
2100                 }
2101                 continue;
2102 
2103             case '^':     /* Single-byte metacharacters */
2104             case '.':
2105             case '$':
2106                 length++;
2107                 lastitemlength = 1;
2108                 continue;
2109 
2110             case '*':            /* These repeats won't be after brackets; */
2111             case '+':            /* those are handled separately */
2112             case '?':
2113                 length++;
2114                 goto POSSESSIVE;
2115 
2116             /* This covers the cases of braced repeats after a single char, metachar,
2117              class, or back reference. */
2118 
2119             case '{':
2120                 if (!isCountedRepeat(ptr + 1, patternEnd))
2121                     goto NORMAL_CHAR;
2122                 ptr = readRepeatCounts(ptr + 1, &minRepeats, &maxRepeats, &errorcode);
2123                 if (errorcode != 0)
2124                     return -1;
2125 
2126                 /* These special cases just insert one extra opcode */
2127 
2128                 if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2129                     (minRepeats == 1 && maxRepeats == -1))
2130                     length++;
2131 
2132                 /* These cases might insert additional copies of a preceding character. */
2133 
2134                 else {
2135                     if (minRepeats != 1) {
2136                         length -= lastitemlength;   /* Uncount the original char or metachar */
2137                         if (minRepeats > 0)
2138                             length += 3 + lastitemlength;
2139                     }
2140                     length += lastitemlength + ((maxRepeats > 0) ? 3 : 1);
2141                 }
2142 
2143                 if (safelyCheckNextChar(ptr, patternEnd, '?'))
2144                     ptr++;      /* Needs no extra length */
2145 
2146             POSSESSIVE:                     /* Test for possessive quantifier */
2147                 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2148                     ptr++;
2149                     length += 2 + 2 * LINK_SIZE;   /* Allow for atomic brackets */
2150                 }
2151                 continue;
2152 
2153             /* An alternation contains an offset to the next branch or ket. If any ims
2154              options changed in the previous branch(es), and/or if we are in a
2155              lookbehind assertion, extra space will be needed at the start of the
2156              branch. This is handled by branch_extra. */
2157 
2158             case '|':
2159                 if (brastackptr == 0)
2160                     cd.needOuterBracket = true;
2161                 length += 1 + LINK_SIZE + branch_extra;
2162                 continue;
2163 
2164             /* A character class uses 33 characters provided that all the character
2165              values are less than 256. Otherwise, it uses a bit map for low valued
2166              characters, and individual items for others. Don't worry about character
2167              types that aren't allowed in classes - they'll get picked up during the
2168              compile. A character class that contains only one single-byte character
2169              uses 2 or 3 bytes, depending on whether it is negated or not. Notice this
2170              where we can. (In UTF-8 mode we can do this only for chars < 128.) */
2171 
2172             case '[': {
2173                 int class_optcount;
2174                 if (*(++ptr) == '^') {
2175                     class_optcount = 10;  /* Greater than one */
2176                     ptr++;
2177                 }
2178                 else
2179                     class_optcount = 0;
2180 
2181                 bool class_utf8 = false;
2182 
2183                 for (; ptr < patternEnd && *ptr != ']'; ++ptr) {
2184                     /* Check for escapes */
2185 
2186                     if (*ptr == '\\') {
2187                         c = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
2188                         if (errorcode != 0)
2189                             return -1;
2190 
2191                         /* Handle escapes that turn into characters */
2192 
2193                         if (c >= 0)
2194                             goto NON_SPECIAL_CHARACTER;
2195 
2196                         /* Escapes that are meta-things. The normal ones just affect the
2197                          bit map, but Unicode properties require an XCLASS extended item. */
2198 
2199                         else
2200                             class_optcount = 10;         /* \d, \s etc; make sure > 1 */
2201                     }
2202 
2203                     /* Anything else increments the possible optimization count. We have to
2204                      detect ranges here so that we can compute the number of extra ranges for
2205                      caseless wide characters when UCP support is available. If there are wide
2206                      characters, we are going to have to use an XCLASS, even for single
2207                      characters. */
2208 
2209                     else {
2210                         c = *ptr;
2211 
2212                         /* Come here from handling \ above when it escapes to a char value */
2213 
2214                     NON_SPECIAL_CHARACTER:
2215                         class_optcount++;
2216 
2217                         int d = -1;
2218                         if (safelyCheckNextChar(ptr, patternEnd, '-')) {
2219                             const UChar* hyptr = ptr++;
2220                             if (safelyCheckNextChar(ptr, patternEnd, '\\')) {
2221                                 ptr++;
2222                                 d = checkEscape(&ptr, patternEnd, &errorcode, cd.numCapturingBrackets, true);
2223                                 if (errorcode != 0)
2224                                     return -1;
2225                             }
2226                             else if ((ptr + 1 < patternEnd) && ptr[1] != ']')
2227                                 d = *++ptr;
2228                             if (d < 0)
2229                                 ptr = hyptr;      /* go back to hyphen as data */
2230                         }
2231 
2232                         /* If d >= 0 we have a range. In UTF-8 mode, if the end is > 255, or >
2233                          127 for caseless matching, we will need to use an XCLASS. */
2234 
2235                         if (d >= 0) {
2236                             class_optcount = 10;     /* Ensure > 1 */
2237                             if (d < c) {
2238                                 errorcode = ERR8;
2239                                 return -1;
2240                             }
2241 
2242                             if ((d > 255 || (ignoreCase && d > 127))) {
2243                                 unsigned char buffer[6];
2244                                 if (!class_utf8)         /* Allow for XCLASS overhead */
2245                                 {
2246                                     class_utf8 = true;
2247                                     length += LINK_SIZE + 2;
2248                                 }
2249 
2250                                 /* If we have UCP support, find out how many extra ranges are
2251                                  needed to map the other case of characters within this range. We
2252                                  have to mimic the range optimization here, because extending the
2253                                  range upwards might push d over a boundary that makes it use
2254                                  another byte in the UTF-8 representation. */
2255 
2256                                 if (ignoreCase) {
2257                                     int occ, ocd;
2258                                     int cc = c;
2259                                     int origd = d;
2260                                     while (getOthercaseRange(&cc, origd, &occ, &ocd)) {
2261                                         if (occ >= c && ocd <= d)
2262                                             continue;   /* Skip embedded */
2263 
2264                                         if (occ < c  && ocd >= c - 1)  /* Extend the basic range */
2265                                         {                            /* if there is overlap,   */
2266                                             c = occ;                     /* noting that if occ < c */
2267                                             continue;                    /* we can't have ocd > d  */
2268                                         }                            /* because a subrange is  */
2269                                         if (ocd > d && occ <= d + 1)   /* always shorter than    */
2270                                         {                            /* the basic range.       */
2271                                             d = ocd;
2272                                             continue;
2273                                         }
2274 
2275                                         /* An extra item is needed */
2276 
2277                                         length += 1 + encodeUTF8(occ, buffer) +
2278                                         ((occ == ocd) ? 0 : encodeUTF8(ocd, buffer));
2279                                     }
2280                                 }
2281 
2282                                 /* The length of the (possibly extended) range */
2283 
2284                                 length += 1 + encodeUTF8(c, buffer) + encodeUTF8(d, buffer);
2285                             }
2286 
2287                         }
2288 
2289                         /* We have a single character. There is nothing to be done unless we
2290                          are in UTF-8 mode. If the char is > 255, or 127 when caseless, we must
2291                          allow for an XCL_SINGLE item, doubled for caselessness if there is UCP
2292                          support. */
2293 
2294                         else {
2295                             if ((c > 255 || (ignoreCase && c > 127))) {
2296                                 unsigned char buffer[6];
2297                                 class_optcount = 10;     /* Ensure > 1 */
2298                                 if (!class_utf8)         /* Allow for XCLASS overhead */
2299                                 {
2300                                     class_utf8 = true;
2301                                     length += LINK_SIZE + 2;
2302                                 }
2303                                 length += (ignoreCase ? 2 : 1) * (1 + encodeUTF8(c, buffer));
2304                             }
2305                         }
2306                     }
2307                 }
2308 
2309                 if (ptr >= patternEnd) {   /* Missing terminating ']' */
2310                     errorcode = ERR6;
2311                     return -1;
2312                 }
2313 
2314                 /* We can optimize when there was only one optimizable character.
2315                  Note that this does not detect the case of a negated single character.
2316                  In that case we do an incorrect length computation, but it's not a serious
2317                  problem because the computed length is too large rather than too small. */
2318 
2319                 if (class_optcount == 1)
2320                     goto NORMAL_CHAR;
2321 
2322                 /* Here, we handle repeats for the class opcodes. */
2323                 {
2324                     length += 33;
2325 
2326                     /* A repeat needs either 1 or 5 bytes. If it is a possessive quantifier,
2327                      we also need extra for wrapping the whole thing in a sub-pattern. */
2328 
2329                     if (safelyCheckNextChar(ptr, patternEnd, '{') && isCountedRepeat(ptr + 2, patternEnd)) {
2330                         ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
2331                         if (errorcode != 0)
2332                             return -1;
2333                         if ((minRepeats == 0 && (maxRepeats == 1 || maxRepeats == -1)) ||
2334                             (minRepeats == 1 && maxRepeats == -1))
2335                             length++;
2336                         else
2337                             length += 5;
2338                         if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2339                             ptr++;
2340                             length += 2 + 2 * LINK_SIZE;
2341                         } else if (safelyCheckNextChar(ptr, patternEnd, '?'))
2342                             ptr++;
2343                     }
2344                 }
2345                 continue;
2346             }
2347 
2348             /* Brackets may be genuine groups or special things */
2349 
2350             case '(': {
2351                 int branch_newextra = 0;
2352                 int bracket_length = 1 + LINK_SIZE;
2353                 bool capturing = false;
2354 
2355                 /* Handle special forms of bracket, which all start (? */
2356 
2357                 if (safelyCheckNextChar(ptr, patternEnd, '?')) {
2358                     switch (c = (ptr + 2 < patternEnd ? ptr[2] : 0)) {
2359                         /* Non-referencing groups and lookaheads just move the pointer on, and
2360                          then behave like a non-special bracket, except that they don't increment
2361                          the count of extracting brackets. Ditto for the "once only" bracket,
2362                          which is in Perl from version 5.005. */
2363 
2364                         case ':':
2365                         case '=':
2366                         case '!':
2367                             ptr += 2;
2368                             break;
2369 
2370                         /* Else loop checking valid options until ) is met. Anything else is an
2371                          error. If we are without any brackets, i.e. at top level, the settings
2372                          act as if specified in the options, so massage the options immediately.
2373                          This is for backward compatibility with Perl 5.004. */
2374 
2375                         default:
2376                             errorcode = ERR12;
2377                             return -1;
2378                     }
2379                 } else
2380                     capturing = 1;
2381 
2382                 /* Capturing brackets must be counted so we can process escapes in a
2383                  Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to need
2384                  an additional 3 bytes of memory per capturing bracket. */
2385 
2386                 if (capturing) {
2387                     bracount++;
2388                     if (bracount > EXTRACT_BASIC_MAX)
2389                         bracket_length += 3;
2390                 }
2391 
2392                 /* Save length for computing whole length at end if there's a repeat that
2393                  requires duplication of the group. Also save the current value of
2394                  branch_extra, and start the new group with the new value. If non-zero, this
2395                  will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2396 
2397                 if (brastackptr >= sizeof(brastack)/sizeof(int)) {
2398                     errorcode = ERR17;
2399                     return -1;
2400                 }
2401 
2402                 bralenstack[brastackptr] = branch_extra;
2403                 branch_extra = branch_newextra;
2404 
2405                 brastack[brastackptr++] = length;
2406                 length += bracket_length;
2407                 continue;
2408             }
2409 
2410             /* Handle ket. Look for subsequent maxRepeats/minRepeats; for certain sets of values we
2411              have to replicate this bracket up to that many times. If brastackptr is
2412              0 this is an unmatched bracket which will generate an error, but take care
2413              not to try to access brastack[-1] when computing the length and restoring
2414              the branch_extra value. */
2415 
2416             case ')': {
2417                 int duplength;
2418                 length += 1 + LINK_SIZE;
2419                 if (brastackptr > 0) {
2420                     duplength = length - brastack[--brastackptr];
2421                     branch_extra = bralenstack[brastackptr];
2422                 }
2423                 else
2424                     duplength = 0;
2425 
2426                 /* Leave ptr at the final char; for readRepeatCounts this happens
2427                  automatically; for the others we need an increment. */
2428 
2429                 if ((ptr + 1 < patternEnd) && (c = ptr[1]) == '{' && isCountedRepeat(ptr + 2, patternEnd)) {
2430                     ptr = readRepeatCounts(ptr + 2, &minRepeats, &maxRepeats, &errorcode);
2431                     if (errorcode)
2432                         return -1;
2433                 } else if (c == '*') {
2434                     minRepeats = 0;
2435                     maxRepeats = -1;
2436                     ptr++;
2437                 } else if (c == '+') {
2438                     minRepeats = 1;
2439                     maxRepeats = -1;
2440                     ptr++;
2441                 } else if (c == '?') {
2442                     minRepeats = 0;
2443                     maxRepeats = 1;
2444                     ptr++;
2445                 } else {
2446                     minRepeats = 1;
2447                     maxRepeats = 1;
2448                 }
2449 
2450                 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2451                  group, and if the maximum is greater than zero, we have to replicate
2452                  maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2453                  bracket set. */
2454 
2455                 int repeatsLength;
2456                 if (minRepeats == 0) {
2457                     length++;
2458                     if (maxRepeats > 0) {
2459                         repeatsLength = multiplyWithOverflowCheck(maxRepeats - 1, duplength + 3 + 2 * LINK_SIZE);
2460                         if (repeatsLength < 0) {
2461                             errorcode = ERR16;
2462                             return -1;
2463                         }
2464                         length += repeatsLength;
2465                         if (length > MAX_PATTERN_SIZE) {
2466                             errorcode = ERR16;
2467                             return -1;
2468                         }
2469                     }
2470                 }
2471 
2472                 /* When the minimum is greater than zero, we have to replicate up to
2473                  minval-1 times, with no additions required in the copies. Then, if there
2474                  is a limited maximum we have to replicate up to maxval-1 times allowing
2475                  for a BRAZERO item before each optional copy and nesting brackets for all
2476                  but one of the optional copies. */
2477 
2478                 else {
2479                     repeatsLength = multiplyWithOverflowCheck(minRepeats - 1, duplength);
2480                     if (repeatsLength < 0) {
2481                         errorcode = ERR16;
2482                         return -1;
2483                     }
2484                     length += repeatsLength;
2485                     if (maxRepeats > minRepeats) { /* Need this test as maxRepeats=-1 means no limit */
2486                         repeatsLength = multiplyWithOverflowCheck(maxRepeats - minRepeats, duplength + 3 + 2 * LINK_SIZE);
2487                         if (repeatsLength < 0) {
2488                             errorcode = ERR16;
2489                             return -1;
2490                         }
2491                         length += repeatsLength - (2 + 2 * LINK_SIZE);
2492                     }
2493                     if (length > MAX_PATTERN_SIZE) {
2494                         errorcode = ERR16;
2495                         return -1;
2496                     }
2497                 }
2498 
2499                 /* Allow space for once brackets for "possessive quantifier" */
2500 
2501                 if (safelyCheckNextChar(ptr, patternEnd, '+')) {
2502                     ptr++;
2503                     length += 2 + 2 * LINK_SIZE;
2504                 }
2505                 continue;
2506             }
2507 
2508             /* Non-special character. It won't be space or # in extended mode, so it is
2509              always a genuine character. If we are in a \Q...\E sequence, check for the
2510              end; if not, we have a literal. */
2511 
2512             default:
2513             NORMAL_CHAR:
2514                 length += 2;          /* For a one-byte character */
2515                 lastitemlength = 1;   /* Default length of last item for repeats */
2516 
2517                 if (c > 127) {
2518                     int i;
2519                     for (i = 0; i < jsc_pcre_utf8_table1_size; i++)
2520                         if (c <= jsc_pcre_utf8_table1[i])
2521                             break;
2522                     length += i;
2523                     lastitemlength += i;
2524                 }
2525 
2526                 continue;
2527         }
2528     }
2529 
2530     length += 2 + LINK_SIZE;    /* For final KET and END */
2531 
2532     cd.numCapturingBrackets = bracount;
2533     return length;
2534 }
2535 
2536 /*************************************************
2537 *        Compile a Regular Expression            *
2538 *************************************************/
2539 
2540 /* This function takes a string and returns a pointer to a block of store
2541 holding a compiled version of the expression. The original API for this
2542 function had no error code return variable; it is retained for backwards
2543 compatibility. The new function is given a new name.
2544 
2545 Arguments:
2546   pattern       the regular expression
2547   options       various option bits
2548   errorCodePtr  pointer to error code variable (pcre_compile2() only)
2549                   can be NULL if you don't want a code value
2550   errorPtr      pointer to pointer to error text
2551   erroroffset   ptr offset in pattern where error was detected
2552   tables        pointer to character tables or NULL
2553 
2554 Returns:        pointer to compiled data block, or NULL on error,
2555                 with errorPtr and erroroffset set
2556 */
2557 
returnError(ErrorCode errorcode,const char ** errorPtr)2558 static inline JSRegExp* returnError(ErrorCode errorcode, const char** errorPtr)
2559 {
2560     *errorPtr = errorText(errorcode);
2561     return 0;
2562 }
2563 
jsRegExpCompile(const UChar * pattern,int patternLength,JSRegExpIgnoreCaseOption ignoreCase,JSRegExpMultilineOption multiline,unsigned * numSubpatterns,const char ** errorPtr)2564 JSRegExp* jsRegExpCompile(const UChar* pattern, int patternLength,
2565                 JSRegExpIgnoreCaseOption ignoreCase, JSRegExpMultilineOption multiline,
2566                 unsigned* numSubpatterns, const char** errorPtr)
2567 {
2568     /* We can't pass back an error message if errorPtr is NULL; I guess the best we
2569      can do is just return NULL, but we can set a code value if there is a code pointer. */
2570     if (!errorPtr)
2571         return 0;
2572     *errorPtr = NULL;
2573 
2574     CompileData cd;
2575 
2576     ErrorCode errorcode = ERR0;
2577     /* Call this once just to count the brackets. */
2578     calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
2579     /* Call it again to compute the length. */
2580     int length = calculateCompiledPatternLength(pattern, patternLength, ignoreCase, cd, errorcode);
2581     if (errorcode)
2582         return returnError(errorcode, errorPtr);
2583 
2584     if (length > MAX_PATTERN_SIZE)
2585         return returnError(ERR16, errorPtr);
2586 
2587     size_t size = length + sizeof(JSRegExp);
2588 #if REGEXP_HISTOGRAM
2589     size_t stringOffset = (size + sizeof(UChar) - 1) / sizeof(UChar) * sizeof(UChar);
2590     size = stringOffset + patternLength * sizeof(UChar);
2591 #endif
2592     JSRegExp* re = reinterpret_cast<JSRegExp*>(new char[size]);
2593 
2594     if (!re)
2595         return returnError(ERR13, errorPtr);
2596 
2597     re->options = (ignoreCase ? IgnoreCaseOption : 0) | (multiline ? MatchAcrossMultipleLinesOption : 0);
2598 
2599     /* The starting points of the name/number translation table and of the code are
2600      passed around in the compile data block. */
2601 
2602     const unsigned char* codeStart = (const unsigned char*)(re + 1);
2603 
2604     /* Set up a starting, non-extracting bracket, then compile the expression. On
2605      error, errorcode will be set non-zero, so we don't need to look at the result
2606      of the function here. */
2607 
2608     const UChar* ptr = (const UChar*)pattern;
2609     const UChar* patternEnd = pattern + patternLength;
2610     unsigned char* code = const_cast<unsigned char*>(codeStart);
2611     int firstByte, reqByte;
2612     int bracketCount = 0;
2613     if (!cd.needOuterBracket)
2614         compileBranch(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, &firstByte, &reqByte, cd);
2615     else {
2616         *code = OP_BRA;
2617         compileBracket(re->options, &bracketCount, &code, &ptr, patternEnd, &errorcode, 0, &firstByte, &reqByte, cd);
2618     }
2619     re->topBracket = bracketCount;
2620     re->topBackref = cd.topBackref;
2621 
2622     /* If not reached end of pattern on success, there's an excess bracket. */
2623 
2624     if (errorcode == 0 && ptr < patternEnd)
2625         errorcode = ERR10;
2626 
2627     /* Fill in the terminating state and check for disastrous overflow, but
2628      if debugging, leave the test till after things are printed out. */
2629 
2630     *code++ = OP_END;
2631 
2632     ASSERT(code - codeStart <= length);
2633     if (code - codeStart > length)
2634         errorcode = ERR7;
2635 
2636     /* Give an error if there's back reference to a non-existent capturing
2637      subpattern. */
2638 
2639     if (re->topBackref > re->topBracket)
2640         errorcode = ERR15;
2641 
2642     /* Failed to compile, or error while post-processing */
2643 
2644     if (errorcode != ERR0) {
2645         delete [] reinterpret_cast<char*>(re);
2646         return returnError(errorcode, errorPtr);
2647     }
2648 
2649     /* If the anchored option was not passed, set the flag if we can determine that
2650      the pattern is anchored by virtue of ^ characters or \A or anything else (such
2651      as starting with .* when DOTALL is set).
2652 
2653      Otherwise, if we know what the first character has to be, save it, because that
2654      speeds up unanchored matches no end. If not, see if we can set the
2655      UseMultiLineFirstByteOptimizationOption flag. This is helpful for multiline matches when all branches
2656      start with ^. and also when all branches start with .* for non-DOTALL matches.
2657      */
2658 
2659     if (cd.needOuterBracket ? bracketIsAnchored(codeStart) : branchIsAnchored(codeStart))
2660         re->options |= IsAnchoredOption;
2661     else {
2662         if (firstByte < 0) {
2663             firstByte = (cd.needOuterBracket
2664                     ? bracketFindFirstAssertedCharacter(codeStart, false)
2665                     : branchFindFirstAssertedCharacter(codeStart, false))
2666                 | ((re->options & IgnoreCaseOption) ? REQ_IGNORE_CASE : 0);
2667         }
2668         if (firstByte >= 0) {
2669             int ch = firstByte & 255;
2670             if (ch < 127) {
2671                 re->firstByte = ((firstByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? ch : firstByte;
2672                 re->options |= UseFirstByteOptimizationOption;
2673             }
2674         } else {
2675             if (cd.needOuterBracket ? bracketNeedsLineStart(codeStart, 0, cd.backrefMap) : branchNeedsLineStart(codeStart, 0, cd.backrefMap))
2676                 re->options |= UseMultiLineFirstByteOptimizationOption;
2677         }
2678     }
2679 
2680     /* For an anchored pattern, we use the "required byte" only if it follows a
2681      variable length item in the regex. Remove the caseless flag for non-caseable
2682      bytes. */
2683 
2684     if (reqByte >= 0 && (!(re->options & IsAnchoredOption) || (reqByte & REQ_VARY))) {
2685         int ch = reqByte & 255;
2686         if (ch < 127) {
2687             re->reqByte = ((reqByte & REQ_IGNORE_CASE) && flipCase(ch) == ch) ? (reqByte & ~REQ_IGNORE_CASE) : reqByte;
2688             re->options |= UseRequiredByteOptimizationOption;
2689         }
2690     }
2691 
2692 #if REGEXP_HISTOGRAM
2693     re->stringOffset = stringOffset;
2694     re->stringLength = patternLength;
2695     memcpy(reinterpret_cast<char*>(re) + stringOffset, pattern, patternLength * 2);
2696 #endif
2697 
2698     if (numSubpatterns)
2699         *numSubpatterns = re->topBracket;
2700     return re;
2701 }
2702 
jsRegExpFree(JSRegExp * re)2703 void jsRegExpFree(JSRegExp* re)
2704 {
2705     delete [] reinterpret_cast<char*>(re);
2706 }
2707