/*
 * Copyright (c) 2021 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 
16 #include "ecmascript/regexp/regexp_parser.h"
17 
18 #include "ecmascript/base/string_helper.h"
19 #include "ecmascript/ecma_macros.h"
20 #include "ecmascript/regexp/regexp_opcode.h"
21 #include "libpandabase/utils/utils.h"
22 #include "securec.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uniset.h"
25 #define _NO_DEBUG_
26 
27 namespace panda::ecmascript {
// NOTE(review): the users of CACHE_SIZE and CHAR_MAXS are not visible in this part of the file.
static constexpr uint32_t CACHE_SIZE = 128U;
static constexpr uint32_t CHAR_MAXS = 128U;
// 128-bit bitmap (four 32-bit words) flagging the ASCII identifier-start characters
// '$', 'A'-'Z', '_' and 'a'-'z'. The values match a layout where code point c is
// bit (c % 32) of word (c / 32) -- presumably how it is indexed; confirm at the use site.
static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
    0x00000000U,  // 0x00-0x1F: none
    0x00000010U,  // 0x20-0x3F: '$'
    0x87FFFFFEU,  // 0x40-0x5F: 'A'-'Z', '_'
    0x07FFFFFEU,  // 0x60-0x7F: 'a'-'z'
};
34 static RangeSet g_rangeD(0x30, 0x39);  // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
35 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
36 static RangeSet g_rangeS({
37     std::pair<uint32_t, uint32_t>(0x0009, 0x000D),  // NOLINTNEXTLINE(readability-magic-numbers)
38     std::pair<uint32_t, uint32_t>(0x0020, 0x0020),  // NOLINTNEXTLINE(readability-magic-numbers)
39     std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0),  // NOLINTNEXTLINE(readability-magic-numbers)
40     std::pair<uint32_t, uint32_t>(0x1680, 0x1680),  // NOLINTNEXTLINE(readability-magic-numbers)
41     std::pair<uint32_t, uint32_t>(0x2000, 0x200A),  // NOLINTNEXTLINE(readability-magic-numbers)
42     /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
43     /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
44     std::pair<uint32_t, uint32_t>(0x2028, 0x2029),  // NOLINTNEXTLINE(readability-magic-numbers)
45     std::pair<uint32_t, uint32_t>(0x202F, 0x202F),  // NOLINTNEXTLINE(readability-magic-numbers)
46     std::pair<uint32_t, uint32_t>(0x205F, 0x205F),  // NOLINTNEXTLINE(readability-magic-numbers)
47     std::pair<uint32_t, uint32_t>(0x3000, 0x3000),  // NOLINTNEXTLINE(readability-magic-numbers)
48     /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
49     std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF),  // NOLINTNEXTLINE(readability-magic-numbers)
50 });
51 
52 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
53 static RangeSet g_rangeW({
54     std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINTNEXTLINE(readability-magic-numbers)
55     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINTNEXTLINE(readability-magic-numbers)
56     std::pair<uint32_t, uint32_t>(0x005F, 0x005F),  // NOLINTNEXTLINE(readability-magic-numbers)
57     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINTNEXTLINE(readability-magic-numbers)
58 });
59 
60 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
61 static RangeSet g_regexpIdentifyStart({
62     std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINTNEXTLINE(readability-magic-numbers)
63     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINTNEXTLINE(readability-magic-numbers)
64     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINTNEXTLINE(readability-magic-numbers)
65 });
66 
67 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
68 static RangeSet g_regexpIdentifyContinue({
69     std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINTNEXTLINE(readability-magic-numbers)
70     std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINTNEXTLINE(readability-magic-numbers)
71     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINTNEXTLINE(readability-magic-numbers)
72     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINTNEXTLINE(readability-magic-numbers)
73 });
74 
Parse()75 void RegExpParser::Parse()
76 {
77     // dynbuffer head init [size,capture_count,statck_count,flags]
78     buffer_.EmitU32(0);
79     buffer_.EmitU32(0);
80     buffer_.EmitU32(0);
81     buffer_.EmitU32(0);
82     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
83     PrintF("Parse Pattern------\n");
84     // Pattern[U, N]::
85     //      Disjunction[?U, ?N]
86     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
87     Advance();
88     SaveStartOpCode saveStartOp;
89     int captureIndex = captureCount_++;
90     saveStartOp.EmitOpCode(&buffer_, captureIndex);
91     ParseDisjunction(false);
92     if (isError_) {
93         return;
94     }
95     if (c0_ != KEY_EOF) {
96         ParseError("extraneous characters at the end");
97         return;
98     }
99     SaveEndOpCode saveEndOp;
100     saveEndOp.EmitOpCode(&buffer_, captureIndex);
101     MatchEndOpCode matchEndOp;
102     matchEndOp.EmitOpCode(&buffer_, 0);
103     // dynbuffer head assignments
104     buffer_.PutU32(0, buffer_.size_);
105     buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
106     buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
107     buffer_.PutU32(FLAGS_OFFSET, flags_);
108 #ifndef _NO_DEBUG_
109     RegExpOpCode::DumpRegExpOpCode(std::cout, buffer_);
110 #endif
111 }
112 
ParseDisjunction(bool isBackward)113 void RegExpParser::ParseDisjunction(bool isBackward)
114 {
115     // check stack overflow because infinite recursion may occur
116     DoParserStackOverflowCheck("invalid regular expression.");
117     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
118     PrintF("Parse Disjunction------\n");
119     size_t start = buffer_.size_;
120     ParseAlternative(isBackward);
121     if (isError_) {
122         return;
123     }
124     do {
125         if (c0_ == '|') {
126             SplitNextOpCode splitOp;
127             uint32_t len = buffer_.size_ - start;
128             GotoOpCode gotoOp;
129             splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
130             uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
131             Advance();
132             ParseAlternative(isBackward);
133             gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
134         }
135     } while (c0_ != KEY_EOF && c0_ != ')');
136 }
137 
ParseOctalLiteral()138 uint32_t RegExpParser::ParseOctalLiteral()
139 {
140     // For compatibility with some other browsers (not all), we parse
141     // up to three octal digits with a value below 256.
142     // ES#prod-annexB-LegacyOctalEscapeSequence
143     uint32_t value = c0_ - '0';
144     Advance();
145     if (c0_ >= '0' && c0_ <= '7') {
146         value = value * OCTAL_VALUE + c0_ - '0';
147         Advance();
148         if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
149             value = value * OCTAL_VALUE + c0_ - '0';
150             Advance();
151         }
152     }
153     return value;
154 }
155 
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)156 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
157 {
158     uint32_t x = 0;
159     int d = static_cast<int>(HexValue(c0_));
160     if (d < 0) {
161         return false;
162     }
163     while (d >= 0) {
164         if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
165             LOG_FULL(FATAL) << "value overflow";
166             return false;
167         }
168         x = x * HEX_VALUE + static_cast<uint32_t>(d);
169         if (x > maxValue) {
170             return false;
171         }
172         Advance();
173         d = static_cast<int>(HexValue(c0_));
174     }
175     *value = x;
176     return true;
177 }
178 
179 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uint32_t * value)180 bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
181 {
182     // Accept both \uxxxx and \u{xxxxxx} (if allowed).
183     // In the latter case, the number of hex digits between { } is arbitrary.
184     // \ and u have already been read.
185     if (c0_ == '{' && IsUtf16()) {
186         uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
187         Advance();
188         if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {  // NOLINTNEXTLINE(readability-magic-numbers)
189             if (c0_ == '}') {
190                 Advance();
191                 return true;
192             }
193         }
194         pc_ = start;
195         Advance();
196         return false;
197     }
198     // \u but no {, or \u{...} escapes not allowed.
199     bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
200     if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
201         // Attempt to read trail surrogate.
202         uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
203         if (*pc_ == 'u') {
204             Advance(UNICODE_HEX_ADVANCE);
205             uint32_t trail = 0;
206             if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
207                 *value = U16_GET_SUPPLEMENTARY((*value), (trail));  // NOLINTNEXTLINE(hicpp-signed-bitwise)
208                 return true;
209             }
210         }
211         pc_ = start;
212         Advance();
213     }
214     return result;
215 }
216 
ParseHexEscape(int length,uint32_t * value)217 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
218 {
219     uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
220     uint32_t val = 0;
221     for (int i = 0; i < length; ++i) {
222         uint32_t c = c0_;
223         int d = static_cast<int>(HexValue(c));
224         if (d < 0) {
225             pc_ = start;
226             Advance();
227             return false;
228         }
229         val = val * HEX_VALUE + static_cast<uint32_t>(d);
230         Advance();
231     }
232     *value = val;
233     return true;
234 }
235 
236 // NOLINTNEXTLINE(readability-function-size)
ParseAlternative(bool isBackward)237 void RegExpParser::ParseAlternative(bool isBackward)
238 {
239     size_t start = buffer_.size_;
240     while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
241         if (isError_) {
242             return;
243         }
244         size_t atomBcStart = buffer_.GetSize();
245         int captureIndex = 0;
246         bool isAtom = false;
247         switch (c0_) {
248             case '^': {
249                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
250                 PrintF("Assertion %c line start \n", c0_);
251                 LineStartOpCode lineStartOp;
252                 lineStartOp.EmitOpCode(&buffer_, 0);
253                 Advance();
254                 break;
255             }
256             case '$': {
257                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
258                 PrintF("Assertion %c line end \n", c0_);
259                 LineEndOpCode lineEndOp;
260                 lineEndOp.EmitOpCode(&buffer_, 0);
261                 Advance();
262                 break;
263             }
264             case '\\': {
265                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
266                 PrintF("Escape %c \n", c0_);
267                 Advance();
268                 switch (c0_) {
269                     case 'b': {
270                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
271                         PrintF("Assertion %c \n", c0_);
272                         WordBoundaryOpCode wordBoundaryOp;
273                         wordBoundaryOp.EmitOpCode(&buffer_, 0);
274                         Advance();
275                         break;
276                     }
277                     case 'B': {
278                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
279                         PrintF("Assertion %c \n", c0_);
280                         NotWordBoundaryOpCode notWordBoundaryOp;
281                         notWordBoundaryOp.EmitOpCode(&buffer_, 0);
282                         Advance();
283                         break;
284                     }
285                     default: {
286                         isAtom = true;
287                         int atomValue = ParseAtomEscape(isBackward);
288                         if (atomValue != -1) {
289                             PrevOpCode prevOp;
290                             if (isBackward) {
291                                 prevOp.EmitOpCode(&buffer_, 0);
292                             }
293                             if (IsIgnoreCase()) {
294                                 if (!IsUtf16()) {
295                                     atomValue = Canonicalize(atomValue, false);
296                                 } else {
297                                     icu::UnicodeSet set(atomValue, atomValue);
298                                     set.closeOver(USET_CASE_INSENSITIVE);
299                                     set.removeAllStrings();
300                                     uint32_t size = static_cast<uint32_t>(set.size());
301                                     RangeOpCode rangeOp;
302                                     RangeSet rangeResult;
303                                     for (uint32_t idx = 0; idx < size; idx++) {
304                                         int32_t uc = set.charAt(idx);
305                                         RangeSet curRange(uc);
306                                         rangeResult.Insert(curRange);
307                                     }
308                                     rangeOp.InsertOpCode(&buffer_, rangeResult);
309                                     break;
310                                 }
311                             }
312                             if (atomValue <= UINT16_MAX) {
313                                 CharOpCode charOp;
314                                 charOp.EmitOpCode(&buffer_, atomValue);
315                             } else {
316                                 Char32OpCode charOp;
317                                 charOp.EmitOpCode(&buffer_, atomValue);
318                             }
319                             if (isBackward) {
320                                 prevOp.EmitOpCode(&buffer_, 0);
321                             }
322                         }
323                         break;
324                     }
325                 }
326                 break;
327             }
328             case '(': {
329                 Advance();
330                 isAtom = ParseAssertionCapture(&captureIndex, isBackward);
331                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
332                 Advance();
333                 break;
334             }
335             case '.': {
336                 PrevOpCode prevOp;
337                 if (isBackward) {
338                     prevOp.EmitOpCode(&buffer_, 0);
339                 }
340                 if (IsDotAll()) {
341                     AllOpCode allOp;
342                     allOp.EmitOpCode(&buffer_, 0);
343                 } else {
344                     DotsOpCode dotsOp;
345                     dotsOp.EmitOpCode(&buffer_, 0);
346                 }
347                 if (isBackward) {
348                     prevOp.EmitOpCode(&buffer_, 0);
349                 }
350                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
351                 PrintF("Atom %c match any \n", c0_);
352                 isAtom = true;
353                 Advance();
354                 break;
355             }
356             case '[': {
357                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
358                 PrintF("Atom %c match range \n", c0_);
359                 isAtom = true;
360                 PrevOpCode prevOp;
361                 Advance();
362                 if (isBackward) {
363                     prevOp.EmitOpCode(&buffer_, 0);
364                 }
365                 bool isInvert = false;
366                 if (c0_ == '^') {
367                     isInvert = true;
368                     Advance();
369                 }
370                 RangeSet rangeResult;
371                 if (!ParseClassRanges(&rangeResult)) {
372                     break;
373                 }
374                 if (isInvert) {
375                     rangeResult.Invert(IsUtf16());
376                 }
377                 uint32_t highValue = rangeResult.HighestValue();
378                 if (highValue <= UINT16_MAX) {
379                     RangeOpCode rangeOp;
380                     rangeOp.InsertOpCode(&buffer_, rangeResult);
381                 } else {
382                     Range32OpCode rangeOp;
383                     rangeOp.InsertOpCode(&buffer_, rangeResult);
384                 }
385 
386                 if (isBackward) {
387                     prevOp.EmitOpCode(&buffer_, 0);
388                 }
389                 break;
390             }
391             case '*':
392             case '+':
393             case '?':
394                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
395                 ParseError("nothing to repeat");
396                 return;
397             case '{': {
398                 uint8_t *begin = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
399                 int dummy;
400                 if (ParserIntervalQuantifier(&dummy, &dummy)) {
401                     ParseError("nothing to repeat");
402                     return;
403                 }
404                 pc_ = begin;
405                 Advance();
406             }
407                 [[fallthrough]];
408             case '}':
409             case ']':
410                 if (IsUtf16()) {
411                     ParseError("syntax error");
412                     return;
413                 }
414                 [[fallthrough]];
415             default: {
416                 // PatternCharacter
417                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
418                 PrintF("PatternCharacter %c\n", c0_);
419                 isAtom = true;
420                 {
421                     PrevOpCode prevOp;
422                     if (isBackward) {
423                         prevOp.EmitOpCode(&buffer_, 0);
424                     }
425                     uint32_t matchedChar = c0_;
426                     if (c0_ > (INT8_MAX + 1)) {
427                         Prev();
428                         int i = 0;
429                         UChar32 c;
430                         int32_t length = end_ - pc_ + 1;
431                         // NOLINTNEXTLINE(hicpp-signed-bitwise)
432                         U8_NEXT(pc_, i, length, c);  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
433                         matchedChar = static_cast<uint32_t>(c);
434                         pc_ += i;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
435                     }
436                     if (IsIgnoreCase()) {
437                         matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
438                     }
439                     if (matchedChar > UINT16_MAX) {
440                         Char32OpCode charOp;
441                         charOp.EmitOpCode(&buffer_, matchedChar);
442                     } else {
443                         CharOpCode charOp;
444                         charOp.EmitOpCode(&buffer_, matchedChar);
445                     }
446                     if (isBackward) {
447                         prevOp.EmitOpCode(&buffer_, 0);
448                     }
449                 }
450                 Advance();
451                 break;
452             }
453         }
454         if (isAtom && !isError_) {
455             ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
456         }
457         if (isBackward) {
458             size_t end = buffer_.GetSize();
459             size_t termSize = end - atomBcStart;
460             size_t moveSize = end - start;
461             buffer_.Expand(end + termSize);
462             if (memmove_s(buffer_.buf_ + start +  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
463                               termSize,           // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
464                           moveSize,
465                           buffer_.buf_ + start,  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
466                           moveSize) != EOK) {
467                 LOG_FULL(FATAL) << "memmove_s failed";
468                 UNREACHABLE();
469             }
470             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
471             if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
472                 LOG_FULL(FATAL) << "memcpy_s failed";
473                 UNREACHABLE();
474             }
475         }
476     }
477 }
478 
FindGroupName(const CString & name)479 int RegExpParser::FindGroupName(const CString &name)
480 {
481     size_t len = 0;
482     size_t nameLen = name.size();
483     const char *p = reinterpret_cast<char *>(groupNames_.buf_);
484     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
485     const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
486     int captureIndex = 1;
487     while (p < bufEnd) {
488         len = strlen(p);
489         if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
490             return captureIndex;
491         }
492         p += len + 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
493         captureIndex++;
494     }
495     return -1;
496 }
497 
ParseAssertionCapture(int * captureIndex,bool isBackward)498 bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
499 {
500     bool isAtom = false;
501     do {
502         if (c0_ == '?') {
503             Advance();
504             switch (c0_) {
505                 // (?=Disjunction[?U, ?N])
506                 case '=': {
507                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
508                     PrintF("Assertion(?= Disjunction)\n");
509                     Advance();
510                     uint32_t start = buffer_.size_;
511                     ParseDisjunction(isBackward);
512                     MatchOpCode matchOp;
513                     matchOp.EmitOpCode(&buffer_, 0);
514                     MatchAheadOpCode matchAheadOp;
515                     uint32_t len = buffer_.size_ - start;
516                     matchAheadOp.InsertOpCode(&buffer_, start, len);
517                     break;
518                 }
519                 // (?!Disjunction[?U, ?N])
520                 case '!': {
521                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
522                     PrintF("Assertion(?! Disjunction)\n");
523                     uint32_t start = buffer_.size_;
524                     Advance();
525                     ParseDisjunction(isBackward);
526                     MatchOpCode matchOp;
527                     matchOp.EmitOpCode(&buffer_, 0);
528                     NegativeMatchAheadOpCode matchAheadOp;
529                     uint32_t len = buffer_.size_ - start;
530                     matchAheadOp.InsertOpCode(&buffer_, start, len);
531                     break;
532                 }
533                 case '<': {
534                     Advance();
535                     // (?<=Disjunction[?U, ?N])
536                     if (c0_ == '=') {
537                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
538                         PrintF("Assertion(?<= Disjunction)\n");
539                         Advance();
540                         uint32_t start = buffer_.size_;
541                         ParseDisjunction(true);
542                         MatchOpCode matchOp;
543                         matchOp.EmitOpCode(&buffer_, 0);
544                         MatchAheadOpCode matchAheadOp;
545                         uint32_t len = buffer_.size_ - start;
546                         matchAheadOp.InsertOpCode(&buffer_, start, len);
547                         // (?<!Disjunction[?U, ?N])
548                     } else if (c0_ == '!') {
549                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
550                         PrintF("Assertion(?<! Disjunction)\n");
551                         Advance();
552                         uint32_t start = buffer_.size_;
553                         ParseDisjunction(true);
554                         MatchOpCode matchOp;
555                         matchOp.EmitOpCode(&buffer_, 0);
556                         NegativeMatchAheadOpCode matchAheadOp;
557                         uint32_t len = buffer_.size_ - start;
558                         matchAheadOp.InsertOpCode(&buffer_, start, len);
559                     } else {
560                         Prev();
561                         CString name;
562                         auto **pp = const_cast<const uint8_t **>(&pc_);
563                         if (!ParseGroupSpecifier(pp, name)) {
564                             ParseError("GroupName Syntax error.");
565                             return false;
566                         }
567                         if (FindGroupName(name) > 0) {
568                             ParseError("Duplicate GroupName error.");
569                             return false;
570                         }
571                         groupNames_.EmitStr(name.c_str());
572                         newGroupNames_.push_back(name);
573                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
574                         PrintF("group name %s", name.c_str());
575                         Advance();
576                         goto parseCapture;  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto)
577                     }
578                     break;
579                 }
580                 // (?:Disjunction[?U, ?N])
581                 case ':':
582                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
583                     PrintF("Atom(?<: Disjunction)\n");
584                     isAtom = true;
585                     Advance();
586                     ParseDisjunction(isBackward);
587                     break;
588                 default:
589                     Advance();
590                     ParseError("? Syntax error.");
591                     return false;
592             }
593             if (isError_) {
594                 return false;
595             }
596         } else {
597             groupNames_.EmitChar(0);
598         parseCapture:
599             isAtom = true;
600             *captureIndex = captureCount_++;
601             SaveEndOpCode saveEndOp;
602             SaveStartOpCode saveStartOp;
603             if (isBackward) {
604                 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
605             } else {
606                 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
607             }
608             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
609             PrintF("capture start %d \n", *captureIndex);
610             ParseDisjunction(isBackward);
611             if (isError_) {
612                 return false;
613             }
614             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
615             PrintF("capture end %d \n", *captureIndex);
616             if (isBackward) {
617                 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
618             } else {
619                 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
620             }
621         }
622     } while (c0_ != ')' && c0_ != KEY_EOF);
623     if (c0_ != ')') {
624         ParseError("capture syntax error");
625         return false;
626     }
627     return isAtom;
628 }
629 
ParseDecimalDigits()630 int RegExpParser::ParseDecimalDigits()
631 {
632     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
633     PrintF("Parse DecimalDigits------\n");
634     uint32_t result = 0;
635     bool overflow = false;
636     while (true) {
637         if (c0_ < '0' || c0_ > '9') {
638             break;
639         }
640         if (!overflow) {
641             if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
642                 overflow = true;
643             } else {
644                 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
645             }
646         }
647         Advance();
648     }
649     if (overflow) {
650         return INT32_MAX;
651     }
652     return result;
653 }
654 
ParserIntervalQuantifier(int * pmin,int * pmax)655 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
656 {
657     // Quantifier::
658     //     QuantifierPrefix
659     //     QuantifierPrefix?
660     // QuantifierPrefix::
661     // *
662     // +
663     // ?
664     // {DecimalDigits}
665     // {DecimalDigits,}
666     // {DecimalDigits,DecimalDigits}
667     Advance();
668     *pmin = ParseDecimalDigits();
669     *pmax = *pmin;
670     switch (c0_) {
671         case ',': {
672             Advance();
673             if (c0_ == '}') {
674                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
675                 PrintF("QuantifierPrefix{DecimalDigits,}\n");
676                 *pmax = INT32_MAX;
677                 Advance();
678             } else {
679                 *pmax = ParseDecimalDigits();
680                 if (c0_ == '}') {
681                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
682                     PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
683                     Advance();
684                 } else {
685                     return false;
686                 }
687             }
688             break;
689         }
690         case '}':
691             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
692             PrintF("QuantifierPrefix{DecimalDigits}\n");
693             Advance();
694             break;
695         default:
696             Advance();
697             return false;
698     }
699     return true;
700 }
701 
ParseQuantifier(size_t atomBcStart,int captureStart,int captureEnd)702 void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
703 {
704     int min = -1;
705     int max = -1;
706     bool isGreedy = true;
707     switch (c0_) {
708         case '*':
709             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
710             PrintF("QuantifierPrefix %c\n", c0_);
711             min = 0;
712             max = INT32_MAX;
713             Advance();
714             break;
715         case '+':
716             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
717             PrintF("QuantifierPrefix %c\n", c0_);
718             min = 1;
719             max = INT32_MAX;
720             Advance();
721             break;
722         case '?':
723             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
724             PrintF("QuantifierPrefix %c\n", c0_);
725             Advance();
726             min = 0;
727             max = 1;
728             break;
729         case '{': {
730             uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
731             if (!ParserIntervalQuantifier(&min, &max)) {
732                 pc_ = start;
733                 Advance();  // back to '{'
734                 return;
735             }
736             if (min > max) {
737                 ParseError("Invalid repetition count");
738                 return;
739             }
740             break;
741         }
742         default:
743             break;
744     }
745     if (c0_ == '?') {
746         isGreedy = false;
747         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
748         PrintF("Quantifier::QuantifierPrefix?\n");
749         Advance();
750     } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
751         ParseError("nothing to repeat");
752         return;
753     }
754     if (min != -1 && max != -1) {
755         stackCount_++;
756         PushOpCode pushOp;
757         pushOp.InsertOpCode(&buffer_, atomBcStart);
758         atomBcStart += pushOp.GetSize();
759 
760         if (captureStart != 0) {
761             SaveResetOpCode saveResetOp;
762             saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
763         }
764 
765         // zero advance check
766         if (max == INT32_MAX) {
767             stackCount_++;
768             PushCharOpCode pushCharOp;
769             pushCharOp.InsertOpCode(&buffer_, atomBcStart);
770             CheckCharOpCode checkCharOp;
771             // NOLINTNEXTLINE(readability-magic-numbers)
772             checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
773         }
774 
775         if (isGreedy) {
776             LoopGreedyOpCode loopOp;
777             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
778         } else {
779             LoopOpCode loopOp;
780             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
781         }
782 
783         if (min == 0) {
784             if (isGreedy) {
785                 SplitNextOpCode splitNextOp;
786                 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
787             } else {
788                 SplitFirstOpCode splitFirstOp;
789                 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
790             }
791         }
792 
793         PopOpCode popOp;
794         popOp.EmitOpCode(&buffer_);
795     }
796 }
797 
ParseGroupSpecifier(const uint8_t ** pp,CString & name)798 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, CString &name)
799 {
800     const uint8_t *p = *pp;
801     uint32_t c = 0;
802     char buffer[CACHE_SIZE] = {0};
803     char *q = buffer;
804     while (true) {
805         if (p <= end_) {
806             c = *p;
807         } else {
808             c = KEY_EOF;
809         }
810         if (c == '\\') {
811             p++;
812             if (*p != 'u') {
813                 return false;
814             }
815             if (!ParseUnicodeEscape(&c)) {
816                 return false;
817             }
818         } else if (c == '>') {
819             break;
820         } else if (c > CACHE_SIZE && c != KEY_EOF) {
821             c = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
822         } else if (c != KEY_EOF) {
823             p++;
824         } else {
825             return false;
826         }
827         if (q == buffer) {
828             if (!IsIdentFirst(c)) {
829                 return false;
830             }
831         } else {
832             if (!u_isIDPart(c)) {
833                 return false;
834             }
835         }
836         if (q != nullptr) {
837             *q++ = c;
838         }
839     } // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
840     p++;
841     *pp = p;
842     name = buffer;
843     return true;
844 }
845 
ParseCaptureCount(const char * groupName)846 int RegExpParser::ParseCaptureCount(const char *groupName)
847 {
848     const uint8_t *p = nullptr;
849     int captureIndex = 1;
850     CString name;
851     hasNamedCaptures_ = 0;
852     for (p = base_; p < end_; p++) {  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
853         switch (*p) {
854             case '(': {
855                 if (p[1] == '?') {  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
856                     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
857                     if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
858                         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
859                         p[CAPTURE_CONUT_ADVANCE] != '=') {
860                         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
861                         hasNamedCaptures_ = 1;
862                         p += CAPTURE_CONUT_ADVANCE;
863                         if (groupName != nullptr) {
864                             if (ParseGroupSpecifier(&p, name)) {
865                                 if (strcmp(name.c_str(), groupName) == 0) {
866                                     return captureIndex;
867                                 }
868                             }
869                         }
870                         captureIndex++;
871                     }
872                 } else {
873                     captureIndex++;
874                 }
875                 break;
876             }
877             case '\\':
878                 p++;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
879                 break;
880             case '[': {
881                 while (p < end_ && *p != ']') {
882                     if (*p == '\\') {
883                         p++;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
884                     }
885                     p++;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
886                 }
887                 break;
888             }
889             default:
890                 break;
891         }
892     }
893     return captureIndex;
894 }
895 
896 // NOLINTNEXTLINE(readability-function-size)
ParseAtomEscape(bool isBackward)897 int RegExpParser::ParseAtomEscape(bool isBackward)
898 {
899     // AtomEscape[U, N]::
900     //     DecimalEscape
901     //     CharacterClassEscape[?U]
902     //     CharacterEscape[?U]
903     //     [+N]kGroupName[?U]
904     int result = -1;
905     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
906     PrintF("Parse AtomEscape------\n");
907     PrevOpCode prevOp;
908     switch (c0_) {
909         case KEY_EOF:
910             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
911             ParseError("unexpected end");
912             break;
913         // DecimalEscape
914         case '1':
915         case '2':
916         case '3':
917         case '4':
918         case '5':
919         case '6':
920         case '7':
921         case '8':
922         case '9': {
923             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
924             PrintF("NonZeroDigit %c\n", c0_);
925             int capture = ParseDecimalDigits();
926             if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
927                 ParseError("invalid backreference count");
928                 break;
929             }
930             if (isBackward) {
931                 BackwardBackReferenceOpCode backReferenceOp;
932                 backReferenceOp.EmitOpCode(&buffer_, capture);
933             } else {
934                 BackReferenceOpCode backReferenceOp;
935                 backReferenceOp.EmitOpCode(&buffer_, capture);
936             }
937             break;
938         }
939         // CharacterClassEscape
940         case 'd': {
941             // [0-9]
942             RangeOpCode rangeOp;
943             if (isBackward) {
944                 prevOp.EmitOpCode(&buffer_, 0);
945             }
946             rangeOp.InsertOpCode(&buffer_, g_rangeD);
947             goto parseLookBehind;
948         }
949         case 'D': {
950             // [^0-9]
951             RangeSet atomRange(g_rangeD);
952             atomRange.Invert(IsUtf16());
953             Range32OpCode rangeOp;
954             if (isBackward) {
955                 prevOp.EmitOpCode(&buffer_, 0);
956             }
957             rangeOp.InsertOpCode(&buffer_, atomRange);
958             goto parseLookBehind;
959         }
960         case 's': {
961             // [\f\n\r\t\v]
962             RangeOpCode rangeOp;
963             if (isBackward) {
964                 prevOp.EmitOpCode(&buffer_, 0);
965             }
966             rangeOp.InsertOpCode(&buffer_, g_rangeS);
967             goto parseLookBehind;
968         }
969         case 'S': {
970             RangeSet atomRange(g_rangeS);
971             Range32OpCode rangeOp;
972             atomRange.Invert(IsUtf16());
973             if (isBackward) {
974                 prevOp.EmitOpCode(&buffer_, 0);
975             }
976             rangeOp.InsertOpCode(&buffer_, atomRange);
977             goto parseLookBehind;
978         }
979         case 'w': {
980             // [A-Za-z0-9]
981             RangeOpCode rangeOp;
982             if (isBackward) {
983                 prevOp.EmitOpCode(&buffer_, 0);
984             }
985             rangeOp.InsertOpCode(&buffer_, g_rangeW);
986             goto parseLookBehind;
987         }
988         case 'W': {
989             // [^A-Za-z0-9]
990             RangeSet atomRange(g_rangeW);
991             atomRange.Invert(IsUtf16());
992             Range32OpCode rangeOp;
993             if (isBackward) {
994                 prevOp.EmitOpCode(&buffer_, 0);
995             }
996             rangeOp.InsertOpCode(&buffer_, atomRange);
997             goto parseLookBehind;
998         }
999         // P{UnicodePropertyValueExpression}
1000         // p{UnicodePropertyValueExpression}
1001         case 'P':
1002         case 'p':
1003         // [+N]kGroupName[?U]
1004         case 'k': {
1005             Advance();
1006             if (c0_ != '<') {
1007                 if (!IsUtf16() || HasNamedCaptures()) {
1008                     ParseError("expecting group name.");
1009                     break;
1010                 }
1011             }
1012             Advance();
1013             Prev();
1014             CString name;
1015             auto **pp = const_cast<const uint8_t **>(&pc_);
1016             if (!ParseGroupSpecifier(pp, name)) {
1017                 ParseError("GroupName Syntax error.");
1018                 break;
1019             }
1020             int postion = FindGroupName(name);
1021             if (postion < 0) {
1022                 postion = ParseCaptureCount(name.c_str());
1023                 if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
1024                     ParseError("group name not defined");
1025                     break;
1026                 }
1027             }
1028             if (isBackward) {
1029                 BackwardBackReferenceOpCode backReferenceOp;
1030                 backReferenceOp.EmitOpCode(&buffer_, postion);
1031             } else {
1032                 BackReferenceOpCode backReferenceOp;
1033                 backReferenceOp.EmitOpCode(&buffer_, postion);
1034             }
1035             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1036             Advance();
1037             break;
1038         }
1039         parseLookBehind: {
1040             if (isBackward) {
1041                 prevOp.EmitOpCode(&buffer_, 0);
1042             }
1043             Advance();
1044             break;
1045         }
1046         default:
1047             result = ParseCharacterEscape();
1048             break;
1049     }
1050     return result;
1051 }
1052 
RecountCaptures()1053 int RegExpParser::RecountCaptures()
1054 {
1055     if (totalCaptureCount_ < 0) {
1056         const char *name = reinterpret_cast<const char*>(groupNames_.buf_);
1057         totalCaptureCount_ = ParseCaptureCount(name);
1058     }
1059     return totalCaptureCount_;
1060 }
HasNamedCaptures()1061 bool RegExpParser::HasNamedCaptures()
1062 {
1063     if (hasNamedCaptures_ < 0) {
1064         RecountCaptures();
1065     }
1066     return false;
1067 }
1068 
ParseCharacterEscape()1069 int RegExpParser::ParseCharacterEscape()
1070 {
1071     // CharacterEscape[U]::
1072     //     ControlEscape
1073     //     c ControlLetter
1074     //     0 [lookahead ∉ DecimalDigit]
1075     //     HexEscapeSequence
1076     //     RegExpUnicodeEscapeSequence[?U]
1077     //     IdentityEscape[?U]
1078     uint32_t result = 0;
1079     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1080     switch (c0_) {
1081         // ControlEscape
1082         case 'f':
1083             result = '\f';
1084             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1085             PrintF("ControlEscape %c\n", c0_);
1086             Advance();
1087             break;
1088         case 'n':
1089             result = '\n';
1090             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1091             PrintF("ControlEscape %c\n", c0_);
1092             Advance();
1093             break;
1094         case 'r':
1095             result = '\r';
1096             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1097             PrintF("ControlEscape %c\n", c0_);
1098             Advance();
1099             break;
1100         case 't':
1101             result = '\t';
1102             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1103             PrintF("ControlEscape %c\n", c0_);
1104             Advance();
1105             break;
1106         case 'v':
1107             result = '\v';
1108             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1109             PrintF("ControlEscape %c\n", c0_);
1110             Advance();
1111             break;
1112         // c ControlLetter
1113         case 'c': {
1114             Advance();
1115             if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1116                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1117                 PrintF("ControlLetter %c\n", c0_);
1118                 result = static_cast<uint32_t>(c0_) & 0x1f;  // NOLINTNEXTLINE(readability-magic-numbers)
1119                 Advance();
1120             } else {
1121                 if (!IsUtf16()) {
1122                     pc_--;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1123                     result = '\\';
1124                 } else {
1125                     ParseError("Invalid control letter");
1126                     return -1;
1127                 }
1128             }
1129             break;
1130         }
1131         case '0': {
1132             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1133             PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
1134             if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) {  // NOLINTNEXTLINE(readability-magic-numbers)
1135                 Advance();
1136                 result = 0;
1137                 break;
1138             }
1139             [[fallthrough]];
1140         }
1141         case '1':
1142         case '2':
1143         case '3':
1144         case '4':
1145         case '5':
1146         case '6':
1147         case '7': {
1148             if (IsUtf16()) {
1149                 // With /u, decimal escape is not interpreted as octal character code.
1150                 ParseError("Invalid class escape");
1151                 return 0;
1152             }
1153             result = ParseOctalLiteral();
1154             break;
1155         }
1156         // ParseHexEscapeSequence
1157         // ParseRegExpUnicodeEscapeSequence
1158         case 'x': {
1159             Advance();
1160             if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
1161                 return result;
1162             }
1163             if (IsUtf16()) {
1164                 ParseError("Invalid class escape");
1165                 return -1;
1166             }
1167             result = 'x';
1168             break;
1169         }
1170         case 'u': {
1171             Advance();
1172             if (ParseUnicodeEscape(&result)) {
1173                 return result;
1174             }
1175             if (IsUtf16()) {
1176                 // With /u, invalid escapes are not treated as identity escapes.
1177                 ParseError("Invalid unicode escape");
1178                 return 0;
1179             }
1180             // If \u is not followed by a two-digit hexadecimal, treat it
1181             // as an identity escape.
1182             result = 'u';
1183             break;
1184         }
1185         // IdentityEscape[?U]
1186         case '$':
1187         case '(':
1188         case ')':
1189         case '*':
1190         case '+':
1191         case '.':
1192         case '/':
1193         case '?':
1194         case '[':
1195         case '\\':
1196         case ']':
1197         case '^':
1198         case '{':
1199         case '|':
1200         case '}':
1201             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1202             PrintF("IdentityEscape %c\n", c0_);
1203             result = c0_;
1204             Advance();
1205             break;
1206         default: {
1207             if (IsUtf16()) {
1208                 ParseError("Invalid unicode escape");
1209                 return 0;
1210             }
1211             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1212             PrintF("SourceCharacter %c\n", c0_);
1213             result = c0_;
1214             if (result < CHAR_MAXS) {
1215                 Advance();
1216             } else {
1217                 Prev();
1218                 const uint8_t *p = pc_;
1219                 result = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
1220                 int offset = static_cast<int>(p - pc_);
1221                 Advance(offset + 1);
1222             }
1223             break;
1224         }
1225     }
1226     return static_cast<int>(result);
1227 }
1228 
ParseClassRanges(RangeSet * result)1229 bool RegExpParser::ParseClassRanges(RangeSet *result)
1230 {
1231     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1232     PrintF("Parse ClassRanges------\n");
1233     while (c0_ != ']') {
1234         RangeSet s1;
1235         bool needInter = false;
1236         uint32_t c1 = ParseClassAtom(&s1);
1237         if (c1 == UINT32_MAX) {
1238             ParseError("invalid class range");
1239             return false;
1240         }
1241         needInter = NeedIntersection(c1);
1242         int next_c0 = *pc_;
1243         if (c0_ == '-' && next_c0 != ']') {
1244             if (c1 == CLASS_RANGE_BASE) {
1245                 if (IsUtf16()) {
1246                     ParseError("invalid class range");
1247                     return false;
1248                 }
1249                 result->Insert(s1);
1250                 continue;
1251             }
1252             Advance();
1253             RangeSet s2;
1254             uint32_t c2 = ParseClassAtom(&s2);
1255             if (c2 == UINT32_MAX) {
1256                 ParseError("invalid class range");
1257                 return false;
1258             }
1259             if (c2 == CLASS_RANGE_BASE) {
1260                 if (IsUtf16()) {
1261                     ParseError("invalid class range");
1262                     return false;
1263                 }
1264                 result->Insert(s2);
1265                 continue;
1266             }
1267             if (c1 < INT8_MAX) {
1268                 if (c1 > c2) {
1269                     ParseError("invalid class range");
1270                     return false;
1271                 }
1272             }
1273             needInter = NeedIntersection(c2);
1274             result->Insert(c1, c2);
1275             if (IsIgnoreCase() && needInter) {
1276                 ProcessIntersection(result);
1277             }
1278         } else {
1279             result->Insert(s1);
1280             if (!(IsIgnoreCase() && needInter)) {
1281                 continue;
1282             }
1283             if (c1 <= 'z' && c1 >= 'a') {
1284                 result->Insert(RangeSet(c1 - 'a' + 'A'));
1285             } else {
1286                 result->Insert(RangeSet(c1 - 'A' + 'a'));
1287             }
1288         }
1289     }
1290     Advance();
1291     return true;
1292 }
1293 
ParseClassAtom(RangeSet * atom)1294 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1295 {
1296     uint32_t ret = UINT32_MAX;
1297     switch (c0_) {
1298         case '\\': {
1299             Advance();
1300             ret = static_cast<uint32_t>(ParseClassEscape(atom));
1301             break;
1302         }
1303         case KEY_EOF:
1304             break;
1305         case 0: {
1306             if (pc_ >= end_) {
1307                 return UINT32_MAX;
1308             }
1309             [[fallthrough]];
1310         }
1311         default: {
1312             uint32_t value = c0_;
1313             size_t u16_size = 0;
1314             if (c0_ > INT8_MAX) {  // NOLINTNEXTLINE(readability-magic-numbers)
1315                 pc_ -= 1;          // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1316                 auto u16_result = base::utf_helper::ConvertUtf8ToUtf16Pair(pc_, true);
1317                 value = u16_result.first;
1318                 u16_size = u16_result.second;
1319                 Advance(u16_size + 1);
1320             } else {
1321                 Advance();
1322             }
1323             atom->Insert(RangeSet(value));
1324             ret = value;
1325             break;
1326         }
1327     }
1328     return ret;
1329 }
1330 
ParseClassEscape(RangeSet * atom)1331 int RegExpParser::ParseClassEscape(RangeSet *atom)
1332 {
1333     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1334     PrintF("Parse ClassEscape------\n");
1335     int result = -1;
1336     switch (c0_) {
1337         case 'b':
1338             Advance();
1339             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1340             PrintF("ClassEscape %c", 'b');
1341             result = '\b';
1342             atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1343             break;
1344         case '-':
1345             Advance();
1346             result = '-';
1347             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1348             PrintF("ClassEscape %c", '-');
1349             atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1350             break;
1351         // CharacterClassEscape
1352         case 'd':
1353         case 'D':
1354             result = CLASS_RANGE_BASE;
1355             atom->Insert(g_rangeD);
1356             if (c0_ == 'D') {
1357                 atom->Invert(IsUtf16());
1358             }
1359             Advance();
1360             break;
1361         case 's':
1362         case 'S':
1363             result = CLASS_RANGE_BASE;
1364             atom->Insert(g_rangeS);
1365             if (c0_ == 'S') {
1366                 atom->Invert(IsUtf16());
1367             }
1368             Advance();
1369             break;
1370         case 'w':
1371         case 'W':
1372             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1373             PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1374             result = CLASS_RANGE_BASE;
1375             atom->Insert(g_rangeW);
1376             if (c0_ == 'W') {
1377                 atom->Invert(IsUtf16());
1378             }
1379             Advance();
1380             break;
1381         // P{UnicodePropertyValueExpression}
1382         // p{UnicodePropertyValueExpression}
1383         case 'P':
1384         case 'p':
1385             PrintF("Warning: \\p is not supported in ECMA 2015!");
1386             Advance();
1387             if (c0_ == '{') {
1388                 Advance();
1389                 if (c0_ == '}') {
1390                     break;  // p{}, invalid
1391                 }
1392                 bool isValue = false;
1393                 ParseUnicodePropertyValueCharacters(&isValue);
1394                 if (!isValue && c0_ == '=') {
1395                     // UnicodePropertyName = UnicodePropertyValue
1396                     Advance();
1397                     if (c0_ == '}') {
1398                         break;  // p{xxx=}, invalid
1399                     }
1400                     ParseUnicodePropertyValueCharacters(&isValue);
1401                 }
1402                 if (c0_ != '}') {
1403                     break;  // p{xxx, invalid
1404                 }
1405                 // should do atom->Invert() here after ECMA 9.0
1406                 Advance();
1407                 result = CLASS_RANGE_BASE;
1408             }
1409             break;
1410         default:
1411             result = ParseCharacterEscape();
1412             int value = result;
1413             if (IsIgnoreCase()) {
1414                 value = Canonicalize(value, IsUtf16());
1415             }
1416             atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1417             break;
1418     }
1419     return result;
1420 }
1421 
ParseUnicodePropertyValueCharacters(bool * isValue)1422 void RegExpParser::ParseUnicodePropertyValueCharacters(bool *isValue)
1423 {
1424     if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1425         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1426         PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1427     } else if (c0_ == '_') {
1428         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1429         PrintF("UnicodePropertyCharacter:: _ \n");
1430     } else if (c0_ >= '0' && c0_ <= '9') {
1431         *isValue = true;
1432         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1433         PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1434     } else {
1435         return;
1436     }
1437     Advance();
1438     ParseUnicodePropertyValueCharacters(isValue);
1439 }
1440 
1441 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1442 void RegExpParser::PrintF(const char *fmt, ...)
1443 {
1444 #ifndef _NO_DEBUG_
1445     va_list args;
1446     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,)
1447     va_start(args, fmt);
1448     vprintf(fmt, args);
1449     va_end(args);
1450 #else
1451     (void)fmt;
1452 #endif
1453 }
1454 
ParseError(const char * errorMessage)1455 void RegExpParser::ParseError(const char *errorMessage)
1456 {
1457     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1458     PrintF("error: ");
1459     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1460     PrintF(errorMessage);
1461     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1462     PrintF("\n");
1463     SetIsError();
1464     size_t length = strlen(errorMessage) + 1;
1465     if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1466         LOG_FULL(FATAL) << "memcpy_s failed";
1467         UNREACHABLE();
1468     }
1469 }
1470 
IsIdentFirst(uint32_t c)1471 int RegExpParser::IsIdentFirst(uint32_t c)
1472 {
1473     if (c < CACHE_SIZE) {
1474         return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1; // 5: Shift five bits 31: and operation binary of 31
1475     } else {
1476         return static_cast<int>(u_isIDStart(c));
1477     }
1478 }
1479 
Canonicalize(int c,bool isUnicode)1480 int RegExpParser::Canonicalize(int c, bool isUnicode)
1481 {
1482     if (c < TMP_BUF_SIZE) {  // NOLINTNEXTLINE(readability-magic-numbers)
1483         if (c >= 'a' && c <= 'z') {
1484             c = c - 'a' + 'A';
1485         }
1486     } else {
1487         int cur = c;
1488         if (isUnicode) {
1489             c = u_tolower(static_cast<UChar32>(c));
1490             if (c >= 'a' && c <= 'z') {
1491                 c = cur;
1492             }
1493         } else {
1494             c = u_toupper(static_cast<UChar32>(c));
1495             if (c >= 'A' && c <= 'Z') {
1496                 c = cur;
1497             }
1498         }
1499     }
1500     return c;
1501 }
1502 
NeedIntersection(uint32_t c)1503 bool RegExpParser::NeedIntersection(uint32_t c)
1504 {
1505     return (c <= 'z' && c >= 'a') || (c <= 'Z' && c >= 'A');
1506 }
1507 
DoParserStackOverflowCheck(const char * errorMessage)1508 void RegExpParser::DoParserStackOverflowCheck(const char *errorMessage)
1509 {
1510     if (UNLIKELY(thread_->GetCurrentStackPosition() < thread_->GetStackLimit())) {
1511         LOG_ECMA(ERROR) << "Stack overflow! current:" << thread_->GetCurrentStackPosition() <<
1512             " limit:" << thread_->GetStackLimit();
1513         ParseError(errorMessage);
1514         return;
1515     }
1516 }
1517 }  // namespace panda::ecmascript
1518