• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "ecmascript/regexp/regexp_parser.h"
17 
18 #include "ecmascript/base/string_helper.h"
19 #include "ecmascript/ecma_macros.h"
20 #include "ecmascript/regexp/regexp_opcode.h"
21 #include "libpandabase/utils/utils.h"
22 #include "securec.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uniset.h"
25 #define _NO_DEBUG_
26 
27 namespace panda::ecmascript {
28 static constexpr uint32_t CACHE_SIZE = 128;
29 static constexpr uint32_t CHAR_MAXS = 128;
30 static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
31     /* $ A-Z _ a-z */
32     0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
33 };
34 static RangeSet g_rangeD(0x30, 0x39);  // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
35 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
36 static RangeSet g_rangeS({
37     std::pair<uint32_t, uint32_t>(0x0009, 0x000D),  // NOLINTNEXTLINE(readability-magic-numbers)
38     std::pair<uint32_t, uint32_t>(0x0020, 0x0020),  // NOLINTNEXTLINE(readability-magic-numbers)
39     std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0),  // NOLINTNEXTLINE(readability-magic-numbers)
40     std::pair<uint32_t, uint32_t>(0x1680, 0x1680),  // NOLINTNEXTLINE(readability-magic-numbers)
41     std::pair<uint32_t, uint32_t>(0x2000, 0x200A),  // NOLINTNEXTLINE(readability-magic-numbers)
42     /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
43     /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
44     std::pair<uint32_t, uint32_t>(0x2028, 0x2029),  // NOLINTNEXTLINE(readability-magic-numbers)
45     std::pair<uint32_t, uint32_t>(0x202F, 0x202F),  // NOLINTNEXTLINE(readability-magic-numbers)
46     std::pair<uint32_t, uint32_t>(0x205F, 0x205F),  // NOLINTNEXTLINE(readability-magic-numbers)
47     std::pair<uint32_t, uint32_t>(0x3000, 0x3000),  // NOLINTNEXTLINE(readability-magic-numbers)
48     /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
49     std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF),  // NOLINTNEXTLINE(readability-magic-numbers)
50 });
51 
52 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
53 static RangeSet g_rangeW({
54     std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINTNEXTLINE(readability-magic-numbers)
55     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINTNEXTLINE(readability-magic-numbers)
56     std::pair<uint32_t, uint32_t>(0x005F, 0x005F),  // NOLINTNEXTLINE(readability-magic-numbers)
57     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINTNEXTLINE(readability-magic-numbers)
58 });
59 
60 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
61 static RangeSet g_regexpIdentifyStart({
62     std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINTNEXTLINE(readability-magic-numbers)
63     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINTNEXTLINE(readability-magic-numbers)
64     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINTNEXTLINE(readability-magic-numbers)
65 });
66 
67 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
68 static RangeSet g_regexpIdentifyContinue({
69     std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINTNEXTLINE(readability-magic-numbers)
70     std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINTNEXTLINE(readability-magic-numbers)
71     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINTNEXTLINE(readability-magic-numbers)
72     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINTNEXTLINE(readability-magic-numbers)
73 });
74 
Parse()75 void RegExpParser::Parse()
76 {
77     // dynbuffer head init [size,capture_count,statck_count,flags]
78     buffer_.EmitU32(0);
79     buffer_.EmitU32(0);
80     buffer_.EmitU32(0);
81     buffer_.EmitU32(0);
82     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
83     PrintF("Parse Pattern------\n");
84     // Pattern[U, N]::
85     //      Disjunction[?U, ?N]
86     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
87     Advance();
88     SaveStartOpCode saveStartOp;
89     int captureIndex = captureCount_++;
90     saveStartOp.EmitOpCode(&buffer_, captureIndex);
91     ParseDisjunction(false);
92     if (c0_ != KEY_EOF) {
93         ParseError("extraneous characters at the end");
94         return;
95     }
96     SaveEndOpCode saveEndOp;
97     saveEndOp.EmitOpCode(&buffer_, captureIndex);
98     MatchEndOpCode matchEndOp;
99     matchEndOp.EmitOpCode(&buffer_, 0);
100     // dynbuffer head assignments
101     buffer_.PutU32(0, buffer_.size_);
102     buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
103     buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
104     buffer_.PutU32(FLAGS_OFFSET, flags_);
105 #ifndef _NO_DEBUG_
106     RegExpOpCode::DumpRegExpOpCode(std::cout, buffer_);
107 #endif
108 }
109 
ParseDisjunction(bool isBackward)110 void RegExpParser::ParseDisjunction(bool isBackward)
111 {
112     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
113     PrintF("Parse Disjunction------\n");
114     size_t start = buffer_.size_;
115     ParseAlternative(isBackward);
116     if (isError_) {
117         return;
118     }
119     do {
120         if (c0_ == '|') {
121             SplitNextOpCode splitOp;
122             uint32_t len = buffer_.size_ - start;
123             GotoOpCode gotoOp;
124             splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
125             uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
126             Advance();
127             ParseAlternative(isBackward);
128             gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
129         }
130     } while (c0_ != KEY_EOF && c0_ != ')');
131 }
132 
ParseOctalLiteral()133 uint32_t RegExpParser::ParseOctalLiteral()
134 {
135     // For compatibility with some other browsers (not all), we parse
136     // up to three octal digits with a value below 256.
137     // ES#prod-annexB-LegacyOctalEscapeSequence
138     uint32_t value = c0_ - '0';
139     Advance();
140     if (c0_ >= '0' && c0_ <= '7') {
141         value = value * OCTAL_VALUE + c0_ - '0';
142         Advance();
143         if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
144             value = value * OCTAL_VALUE + c0_ - '0';
145             Advance();
146         }
147     }
148     return value;
149 }
150 
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)151 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
152 {
153     uint32_t x = 0;
154     int d = static_cast<int>(HexValue(c0_));
155     if (d < 0) {
156         return false;
157     }
158     while (d >= 0) {
159         if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
160             LOG_FULL(FATAL) << "value overflow";
161             return false;
162         }
163         x = x * HEX_VALUE + static_cast<uint32_t>(d);
164         if (x > maxValue) {
165             return false;
166         }
167         Advance();
168         d = static_cast<int>(HexValue(c0_));
169     }
170     *value = x;
171     return true;
172 }
173 
174 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uint32_t * value)175 bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
176 {
177     // Accept both \uxxxx and \u{xxxxxx} (if allowed).
178     // In the latter case, the number of hex digits between { } is arbitrary.
179     // \ and u have already been read.
180     if (c0_ == '{' && IsUtf16()) {
181         uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
182         Advance();
183         if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {  // NOLINTNEXTLINE(readability-magic-numbers)
184             if (c0_ == '}') {
185                 Advance();
186                 return true;
187             }
188         }
189         pc_ = start;
190         Advance();
191         return false;
192     }
193     // \u but no {, or \u{...} escapes not allowed.
194     bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
195     if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
196         // Attempt to read trail surrogate.
197         uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
198         if (*pc_ == 'u') {
199             Advance(UNICODE_HEX_ADVANCE);
200             uint32_t trail = 0;
201             if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
202                 *value = U16_GET_SUPPLEMENTARY((*value), (trail));  // NOLINTNEXTLINE(hicpp-signed-bitwise)
203                 return true;
204             }
205         }
206         pc_ = start;
207         Advance();
208     }
209     return result;
210 }
211 
ParseHexEscape(int length,uint32_t * value)212 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
213 {
214     uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
215     uint32_t val = 0;
216     for (int i = 0; i < length; ++i) {
217         uint32_t c = c0_;
218         int d = static_cast<int>(HexValue(c));
219         if (d < 0) {
220             pc_ = start;
221             Advance();
222             return false;
223         }
224         val = val * HEX_VALUE + static_cast<uint32_t>(d);
225         Advance();
226     }
227     *value = val;
228     return true;
229 }
230 
231 // NOLINTNEXTLINE(readability-function-size)
ParseAlternative(bool isBackward)232 void RegExpParser::ParseAlternative(bool isBackward)
233 {
234     size_t start = buffer_.size_;
235     while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
236         if (isError_) {
237             return;
238         }
239         size_t atomBcStart = buffer_.GetSize();
240         int captureIndex = 0;
241         bool isAtom = false;
242         switch (c0_) {
243             case '^': {
244                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
245                 PrintF("Assertion %c line start \n", c0_);
246                 LineStartOpCode lineStartOp;
247                 lineStartOp.EmitOpCode(&buffer_, 0);
248                 Advance();
249                 break;
250             }
251             case '$': {
252                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
253                 PrintF("Assertion %c line end \n", c0_);
254                 LineEndOpCode lineEndOp;
255                 lineEndOp.EmitOpCode(&buffer_, 0);
256                 Advance();
257                 break;
258             }
259             case '\\': {
260                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
261                 PrintF("Escape %c \n", c0_);
262                 Advance();
263                 switch (c0_) {
264                     case 'b': {
265                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
266                         PrintF("Assertion %c \n", c0_);
267                         WordBoundaryOpCode wordBoundaryOp;
268                         wordBoundaryOp.EmitOpCode(&buffer_, 0);
269                         Advance();
270                         break;
271                     }
272                     case 'B': {
273                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
274                         PrintF("Assertion %c \n", c0_);
275                         NotWordBoundaryOpCode notWordBoundaryOp;
276                         notWordBoundaryOp.EmitOpCode(&buffer_, 0);
277                         Advance();
278                         break;
279                     }
280                     default: {
281                         isAtom = true;
282                         int atomValue = ParseAtomEscape(isBackward);
283                         if (atomValue != -1) {
284                             PrevOpCode prevOp;
285                             if (isBackward) {
286                                 prevOp.EmitOpCode(&buffer_, 0);
287                             }
288                             if (IsIgnoreCase()) {
289                                 if (!IsUtf16()) {
290                                     atomValue = Canonicalize(atomValue, false);
291                                 } else {
292                                     icu::UnicodeSet set(atomValue, atomValue);
293                                     set.closeOver(USET_CASE_INSENSITIVE);
294                                     set.removeAllStrings();
295                                     int32_t size = set.size();
296                                     RangeOpCode rangeOp;
297                                     RangeSet rangeResult;
298                                     for (int32_t idx = 0; idx < size; idx++) {
299                                         int32_t uc = set.charAt(idx);
300                                         RangeSet curRange(uc);
301                                         rangeResult.Insert(curRange);
302                                     }
303                                     rangeOp.InsertOpCode(&buffer_, rangeResult);
304                                     break;
305                                 }
306                             }
307                             if (atomValue <= UINT16_MAX) {
308                                 CharOpCode charOp;
309                                 charOp.EmitOpCode(&buffer_, atomValue);
310                             } else {
311                                 Char32OpCode charOp;
312                                 charOp.EmitOpCode(&buffer_, atomValue);
313                             }
314                             if (isBackward) {
315                                 prevOp.EmitOpCode(&buffer_, 0);
316                             }
317                         }
318                         break;
319                     }
320                 }
321                 break;
322             }
323             case '(': {
324                 Advance();
325                 isAtom = ParseAssertionCapture(&captureIndex, isBackward);
326                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
327                 Advance();
328                 break;
329             }
330             case '.': {
331                 PrevOpCode prevOp;
332                 if (isBackward) {
333                     prevOp.EmitOpCode(&buffer_, 0);
334                 }
335                 if (IsDotAll()) {
336                     AllOpCode allOp;
337                     allOp.EmitOpCode(&buffer_, 0);
338                 } else {
339                     DotsOpCode dotsOp;
340                     dotsOp.EmitOpCode(&buffer_, 0);
341                 }
342                 if (isBackward) {
343                     prevOp.EmitOpCode(&buffer_, 0);
344                 }
345                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
346                 PrintF("Atom %c match any \n", c0_);
347                 isAtom = true;
348                 Advance();
349                 break;
350             }
351             case '[': {
352                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
353                 PrintF("Atom %c match range \n", c0_);
354                 isAtom = true;
355                 PrevOpCode prevOp;
356                 Advance();
357                 if (isBackward) {
358                     prevOp.EmitOpCode(&buffer_, 0);
359                 }
360                 bool isInvert = false;
361                 if (c0_ == '^') {
362                     isInvert = true;
363                     Advance();
364                 }
365                 RangeSet rangeResult;
366                 if (!ParseClassRanges(&rangeResult)) {
367                     break;
368                 }
369                 if (isInvert) {
370                     rangeResult.Invert(IsUtf16());
371                 }
372                 uint32_t highValue = rangeResult.HighestValue();
373                 if (highValue <= UINT16_MAX) {
374                     RangeOpCode rangeOp;
375                     rangeOp.InsertOpCode(&buffer_, rangeResult);
376                 } else {
377                     Range32OpCode rangeOp;
378                     rangeOp.InsertOpCode(&buffer_, rangeResult);
379                 }
380 
381                 if (isBackward) {
382                     prevOp.EmitOpCode(&buffer_, 0);
383                 }
384                 break;
385             }
386             case '*':
387             case '+':
388             case '?':
389                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
390                 ParseError("nothing to repeat");
391                 return;
392             case '{': {
393                 uint8_t *begin = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
394                 int dummy;
395                 if (ParserIntervalQuantifier(&dummy, &dummy)) {
396                     ParseError("nothing to repeat");
397                     return;
398                 }
399                 pc_ = begin;
400                 Advance();
401             }
402                 [[fallthrough]];
403             case '}':
404             case ']':
405                 if (IsUtf16()) {
406                     ParseError("syntax error");
407                     return;
408                 }
409                 [[fallthrough]];
410             default: {
411                 // PatternCharacter
412                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
413                 PrintF("PatternCharacter %c\n", c0_);
414                 isAtom = true;
415                 {
416                     PrevOpCode prevOp;
417                     if (isBackward) {
418                         prevOp.EmitOpCode(&buffer_, 0);
419                     }
420                     uint32_t matchedChar = c0_;
421                     if (c0_ > (INT8_MAX + 1)) {
422                         Prev();
423                         int i = 0;
424                         UChar32 c;
425                         int32_t length = end_ - pc_ + 1;
426                         // NOLINTNEXTLINE(hicpp-signed-bitwise)
427                         U8_NEXT(pc_, i, length, c);  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
428                         matchedChar = static_cast<uint32_t>(c);
429                         pc_ += i;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
430                     }
431                     if (IsIgnoreCase()) {
432                         matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
433                     }
434                     if (matchedChar > UINT16_MAX) {
435                         Char32OpCode charOp;
436                         charOp.EmitOpCode(&buffer_, matchedChar);
437                     } else {
438                         CharOpCode charOp;
439                         charOp.EmitOpCode(&buffer_, matchedChar);
440                     }
441                     if (isBackward) {
442                         prevOp.EmitOpCode(&buffer_, 0);
443                     }
444                 }
445                 Advance();
446                 break;
447             }
448         }
449         if (isAtom && !isError_) {
450             ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
451         }
452         if (isBackward) {
453             size_t end = buffer_.GetSize();
454             size_t termSize = end - atomBcStart;
455             size_t moveSize = end - start;
456             buffer_.Expand(end + termSize);
457             if (memmove_s(buffer_.buf_ + start +  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
458                               termSize,           // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
459                           moveSize,
460                           buffer_.buf_ + start,  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
461                           moveSize) != EOK) {
462                 LOG_FULL(FATAL) << "memmove_s failed";
463                 UNREACHABLE();
464             }
465             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
466             if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
467                 LOG_FULL(FATAL) << "memcpy_s failed";
468                 UNREACHABLE();
469             }
470         }
471     }
472 }
473 
FindGroupName(const CString & name)474 int RegExpParser::FindGroupName(const CString &name)
475 {
476     size_t len = 0;
477     size_t nameLen = name.size();
478     const char *p = reinterpret_cast<char *>(groupNames_.buf_);
479     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
480     const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
481     int captureIndex = 1;
482     while (p < bufEnd) {
483         len = strlen(p);
484         if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
485             return captureIndex;
486         }
487         p += len + 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
488         captureIndex++;
489     }
490     return -1;
491 }
492 
ParseAssertionCapture(int * captureIndex,bool isBackward)493 bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
494 {
495     bool isAtom = false;
496     do {
497         if (c0_ == '?') {
498             Advance();
499             switch (c0_) {
500                 // (?=Disjunction[?U, ?N])
501                 case '=': {
502                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
503                     PrintF("Assertion(?= Disjunction)\n");
504                     Advance();
505                     uint32_t start = buffer_.size_;
506                     ParseDisjunction(isBackward);
507                     MatchOpCode matchOp;
508                     matchOp.EmitOpCode(&buffer_, 0);
509                     MatchAheadOpCode matchAheadOp;
510                     uint32_t len = buffer_.size_ - start;
511                     matchAheadOp.InsertOpCode(&buffer_, start, len);
512                     break;
513                 }
514                 // (?!Disjunction[?U, ?N])
515                 case '!': {
516                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
517                     PrintF("Assertion(?! Disjunction)\n");
518                     uint32_t start = buffer_.size_;
519                     Advance();
520                     ParseDisjunction(isBackward);
521                     MatchOpCode matchOp;
522                     matchOp.EmitOpCode(&buffer_, 0);
523                     NegativeMatchAheadOpCode matchAheadOp;
524                     uint32_t len = buffer_.size_ - start;
525                     matchAheadOp.InsertOpCode(&buffer_, start, len);
526                     break;
527                 }
528                 case '<': {
529                     Advance();
530                     // (?<=Disjunction[?U, ?N])
531                     if (c0_ == '=') {
532                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
533                         PrintF("Assertion(?<= Disjunction)\n");
534                         Advance();
535                         uint32_t start = buffer_.size_;
536                         ParseDisjunction(true);
537                         MatchOpCode matchOp;
538                         matchOp.EmitOpCode(&buffer_, 0);
539                         MatchAheadOpCode matchAheadOp;
540                         uint32_t len = buffer_.size_ - start;
541                         matchAheadOp.InsertOpCode(&buffer_, start, len);
542                         // (?<!Disjunction[?U, ?N])
543                     } else if (c0_ == '!') {
544                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
545                         PrintF("Assertion(?<! Disjunction)\n");
546                         Advance();
547                         uint32_t start = buffer_.size_;
548                         ParseDisjunction(true);
549                         MatchOpCode matchOp;
550                         matchOp.EmitOpCode(&buffer_, 0);
551                         NegativeMatchAheadOpCode matchAheadOp;
552                         uint32_t len = buffer_.size_ - start;
553                         matchAheadOp.InsertOpCode(&buffer_, start, len);
554                     } else {
555                         Prev();
556                         CString name;
557                         auto **pp = const_cast<const uint8_t **>(&pc_);
558                         if (!ParseGroupSpecifier(pp, name)) {
559                             ParseError("GroupName Syntax error.");
560                             return false;
561                         }
562                         if (FindGroupName(name) > 0) {
563                             ParseError("Duplicate GroupName error.");
564                             return false;
565                         }
566                         groupNames_.EmitStr(name.c_str());
567                         newGroupNames_.push_back(name);
568                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
569                         PrintF("group name %s", name.c_str());
570                         Advance();
571                         goto parseCapture;  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto)
572                     }
573                     break;
574                 }
575                 // (?:Disjunction[?U, ?N])
576                 case ':':
577                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
578                     PrintF("Atom(?<: Disjunction)\n");
579                     isAtom = true;
580                     Advance();
581                     ParseDisjunction(isBackward);
582                     break;
583                 default:
584                     Advance();
585                     ParseError("? Syntax error.");
586                     return false;
587             }
588             if (isError_) {
589                 return false;
590             }
591         } else {
592             groupNames_.EmitChar(0);
593         parseCapture:
594             isAtom = true;
595             *captureIndex = captureCount_++;
596             SaveEndOpCode saveEndOp;
597             SaveStartOpCode saveStartOp;
598             if (isBackward) {
599                 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
600             } else {
601                 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
602             }
603             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
604             PrintF("capture start %d \n", *captureIndex);
605             ParseDisjunction(isBackward);
606             if (isError_) {
607                 return false;
608             }
609             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
610             PrintF("capture end %d \n", *captureIndex);
611             if (isBackward) {
612                 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
613             } else {
614                 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
615             }
616         }
617     } while (c0_ != ')' && c0_ != KEY_EOF);
618     if (c0_ != ')') {
619         ParseError("capture syntax error");
620         return false;
621     }
622     return isAtom;
623 }
624 
ParseDecimalDigits()625 int RegExpParser::ParseDecimalDigits()
626 {
627     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
628     PrintF("Parse DecimalDigits------\n");
629     uint32_t result = 0;
630     bool overflow = false;
631     while (true) {
632         if (c0_ < '0' || c0_ > '9') {
633             break;
634         }
635         if (!overflow) {
636             if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
637                 overflow = true;
638             } else {
639                 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
640             }
641         }
642         Advance();
643     }
644     if (overflow) {
645         return INT32_MAX;
646     }
647     return result;
648 }
649 
ParserIntervalQuantifier(int * pmin,int * pmax)650 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
651 {
652     // Quantifier::
653     //     QuantifierPrefix
654     //     QuantifierPrefix?
655     // QuantifierPrefix::
656     // *
657     // +
658     // ?
659     // {DecimalDigits}
660     // {DecimalDigits,}
661     // {DecimalDigits,DecimalDigits}
662     Advance();
663     *pmin = ParseDecimalDigits();
664     *pmax = *pmin;
665     switch (c0_) {
666         case ',': {
667             Advance();
668             if (c0_ == '}') {
669                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
670                 PrintF("QuantifierPrefix{DecimalDigits,}\n");
671                 *pmax = INT32_MAX;
672                 Advance();
673             } else {
674                 *pmax = ParseDecimalDigits();
675                 if (c0_ == '}') {
676                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
677                     PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
678                     Advance();
679                 } else {
680                     return false;
681                 }
682             }
683             break;
684         }
685         case '}':
686             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
687             PrintF("QuantifierPrefix{DecimalDigits}\n");
688             Advance();
689             break;
690         default:
691             Advance();
692             return false;
693     }
694     return true;
695 }
696 
ParseQuantifier(size_t atomBcStart,int captureStart,int captureEnd)697 void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
698 {
699     int min = -1;
700     int max = -1;
701     bool isGreedy = true;
702     switch (c0_) {
703         case '*':
704             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
705             PrintF("QuantifierPrefix %c\n", c0_);
706             min = 0;
707             max = INT32_MAX;
708             Advance();
709             break;
710         case '+':
711             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
712             PrintF("QuantifierPrefix %c\n", c0_);
713             min = 1;
714             max = INT32_MAX;
715             Advance();
716             break;
717         case '?':
718             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
719             PrintF("QuantifierPrefix %c\n", c0_);
720             Advance();
721             min = 0;
722             max = 1;
723             break;
724         case '{': {
725             uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
726             if (!ParserIntervalQuantifier(&min, &max)) {
727                 pc_ = start;
728                 Advance();  // back to '{'
729                 return;
730             }
731             if (min > max) {
732                 ParseError("Invalid repetition count");
733                 return;
734             }
735             break;
736         }
737         default:
738             break;
739     }
740     if (c0_ == '?') {
741         isGreedy = false;
742         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
743         PrintF("Quantifier::QuantifierPrefix?\n");
744         Advance();
745     } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
746         ParseError("nothing to repeat");
747         return;
748     }
749     if (min != -1 && max != -1) {
750         stackCount_++;
751         PushOpCode pushOp;
752         pushOp.InsertOpCode(&buffer_, atomBcStart);
753         atomBcStart += pushOp.GetSize();
754 
755         if (captureStart != 0) {
756             SaveResetOpCode saveResetOp;
757             saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
758         }
759 
760         // zero advance check
761         if (max == INT32_MAX) {
762             stackCount_++;
763             PushCharOpCode pushCharOp;
764             pushCharOp.InsertOpCode(&buffer_, atomBcStart);
765             CheckCharOpCode checkCharOp;
766             // NOLINTNEXTLINE(readability-magic-numbers)
767             checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
768         }
769 
770         if (isGreedy) {
771             LoopGreedyOpCode loopOp;
772             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
773         } else {
774             LoopOpCode loopOp;
775             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
776         }
777 
778         if (min == 0) {
779             if (isGreedy) {
780                 SplitNextOpCode splitNextOp;
781                 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
782             } else {
783                 SplitFirstOpCode splitFirstOp;
784                 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
785             }
786         }
787 
788         PopOpCode popOp;
789         popOp.EmitOpCode(&buffer_);
790     }
791 }
792 
ParseGroupSpecifier(const uint8_t ** pp,CString & name)793 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, CString &name)
794 {
795     const uint8_t *p = *pp;
796     uint32_t c = 0;
797     char buffer[CACHE_SIZE] = {0};
798     char *q = buffer;
799     while (true) {
800         if (p <= end_) {
801             c = *p;
802         } else {
803             c = KEY_EOF;
804         }
805         if (c == '\\') {
806             p++;
807             if (*p != 'u') {
808                 return false;
809             }
810             if (!ParseUnicodeEscape(&c)) {
811                 return false;
812             }
813         } else if (c == '>') {
814             break;
815         } else if (c > CACHE_SIZE && c != KEY_EOF) {
816             c = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
817         } else if (c != KEY_EOF) {
818             p++;
819         } else {
820             return false;
821         }
822         if (q == buffer) {
823             if (!IsIdentFirst(c)) {
824                 return false;
825             }
826         } else {
827             if (!u_isIDPart(c)) {
828                 return false;
829             }
830         }
831         if (q != nullptr) {
832             *q++ = c;
833         }
834     } // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
835     p++;
836     *pp = p;
837     name = buffer;
838     return true;
839 }
840 
ParseCaptureCount(const char * groupName)841 int RegExpParser::ParseCaptureCount(const char *groupName)
842 {
843     const uint8_t *p = nullptr;
844     int captureIndex = 1;
845     CString name;
846     hasNamedCaptures_ = 0;
847     for (p = base_; p < end_; p++) {  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
848         switch (*p) {
849             case '(': {
850                 if (p[1] == '?') {  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
851                     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
852                     if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
853                         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
854                         p[CAPTURE_CONUT_ADVANCE] != '=') {
855                         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
856                         hasNamedCaptures_ = 1;
857                         p += CAPTURE_CONUT_ADVANCE;
858                         if (groupName != nullptr) {
859                             if (ParseGroupSpecifier(&p, name)) {
860                                 if (strcmp(name.c_str(), groupName) == 0) {
861                                     return captureIndex;
862                                 }
863                             }
864                         }
865                         captureIndex++;
866                     }
867                 } else {
868                     captureIndex++;
869                 }
870                 break;
871             }
872             case '\\':
873                 p++;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
874                 break;
875             case '[': {
876                 while (p < end_ && *p != ']') {
877                     if (*p == '\\') {
878                         p++;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
879                     }
880                     p++;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
881                 }
882                 break;
883             }
884             default:
885                 break;
886         }
887     }
888     return captureIndex;
889 }
890 
891 // NOLINTNEXTLINE(readability-function-size)
ParseAtomEscape(bool isBackward)892 int RegExpParser::ParseAtomEscape(bool isBackward)
893 {
894     // AtomEscape[U, N]::
895     //     DecimalEscape
896     //     CharacterClassEscape[?U]
897     //     CharacterEscape[?U]
898     //     [+N]kGroupName[?U]
899     int result = -1;
900     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
901     PrintF("Parse AtomEscape------\n");
902     PrevOpCode prevOp;
903     switch (c0_) {
904         case KEY_EOF:
905             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
906             ParseError("unexpected end");
907             break;
908         // DecimalEscape
909         case '1':
910         case '2':
911         case '3':
912         case '4':
913         case '5':
914         case '6':
915         case '7':
916         case '8':
917         case '9': {
918             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
919             PrintF("NonZeroDigit %c\n", c0_);
920             int capture = ParseDecimalDigits();
921             if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
922                 ParseError("invalid backreference count");
923                 break;
924             }
925             if (isBackward) {
926                 BackwardBackReferenceOpCode backReferenceOp;
927                 backReferenceOp.EmitOpCode(&buffer_, capture);
928             } else {
929                 BackReferenceOpCode backReferenceOp;
930                 backReferenceOp.EmitOpCode(&buffer_, capture);
931             }
932             break;
933         }
934         // CharacterClassEscape
935         case 'd': {
936             // [0-9]
937             RangeOpCode rangeOp;
938             if (isBackward) {
939                 prevOp.EmitOpCode(&buffer_, 0);
940             }
941             rangeOp.InsertOpCode(&buffer_, g_rangeD);
942             goto parseLookBehind;
943             break;
944         }
945         case 'D': {
946             // [^0-9]
947             RangeSet atomRange(g_rangeD);
948             atomRange.Invert(IsUtf16());
949             Range32OpCode rangeOp;
950             if (isBackward) {
951                 prevOp.EmitOpCode(&buffer_, 0);
952             }
953             rangeOp.InsertOpCode(&buffer_, atomRange);
954             goto parseLookBehind;
955             break;
956         }
957         case 's': {
958             // [\f\n\r\t\v]
959             RangeOpCode rangeOp;
960             if (isBackward) {
961                 prevOp.EmitOpCode(&buffer_, 0);
962             }
963             rangeOp.InsertOpCode(&buffer_, g_rangeS);
964             goto parseLookBehind;
965             break;
966         }
967         case 'S': {
968             RangeSet atomRange(g_rangeS);
969             Range32OpCode rangeOp;
970             atomRange.Invert(IsUtf16());
971             if (isBackward) {
972                 prevOp.EmitOpCode(&buffer_, 0);
973             }
974             rangeOp.InsertOpCode(&buffer_, atomRange);
975             goto parseLookBehind;
976             break;
977         }
978         case 'w': {
979             // [A-Za-z0-9]
980             RangeOpCode rangeOp;
981             if (isBackward) {
982                 prevOp.EmitOpCode(&buffer_, 0);
983             }
984             rangeOp.InsertOpCode(&buffer_, g_rangeW);
985             goto parseLookBehind;
986             break;
987         }
988         case 'W': {
989             // [^A-Za-z0-9]
990             RangeSet atomRange(g_rangeW);
991             atomRange.Invert(IsUtf16());
992             Range32OpCode rangeOp;
993             if (isBackward) {
994                 prevOp.EmitOpCode(&buffer_, 0);
995             }
996             rangeOp.InsertOpCode(&buffer_, atomRange);
997             goto parseLookBehind;
998             break;
999         }
1000         // P{UnicodePropertyValueExpression}
1001         // p{UnicodePropertyValueExpression}
1002         case 'P':
1003         case 'p':
1004         // [+N]kGroupName[?U]
1005         case 'k': {
1006             Advance();
1007             if (c0_ != '<') {
1008                 if (!IsUtf16() || HasNamedCaptures()) {
1009                     ParseError("expecting group name.");
1010                     break;
1011                 }
1012             }
1013             Advance();
1014             Prev();
1015             CString name;
1016             auto **pp = const_cast<const uint8_t **>(&pc_);
1017             if (!ParseGroupSpecifier(pp, name)) {
1018                 ParseError("GroupName Syntax error.");
1019                 break;
1020             }
1021             int postion = FindGroupName(name);
1022             if (postion < 0) {
1023                 postion = ParseCaptureCount(name.c_str());
1024                 if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
1025                     ParseError("group name not defined");
1026                     break;
1027                 }
1028             }
1029             if (isBackward) {
1030                 BackwardBackReferenceOpCode backReferenceOp;
1031                 backReferenceOp.EmitOpCode(&buffer_, postion);
1032             } else {
1033                 BackReferenceOpCode backReferenceOp;
1034                 backReferenceOp.EmitOpCode(&buffer_, postion);
1035             }
1036             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1037             Advance();
1038             break;
1039         }
1040         parseLookBehind: {
1041             if (isBackward) {
1042                 prevOp.EmitOpCode(&buffer_, 0);
1043             }
1044             Advance();
1045             break;
1046         }
1047         default:
1048             result = ParseCharacterEscape();
1049             break;
1050     }
1051     return result;
1052 }
1053 
RecountCaptures()1054 int RegExpParser::RecountCaptures()
1055 {
1056     if (totalCaptureCount_ < 0) {
1057         const char *name = reinterpret_cast<const char*>(groupNames_.buf_);
1058         totalCaptureCount_ = ParseCaptureCount(name);
1059     }
1060     return totalCaptureCount_;
1061 }
HasNamedCaptures()1062 bool RegExpParser::HasNamedCaptures()
1063 {
1064     if (hasNamedCaptures_ < 0) {
1065         RecountCaptures();
1066     }
1067     return false;
1068 }
1069 
ParseCharacterEscape()1070 int RegExpParser::ParseCharacterEscape()
1071 {
1072     // CharacterEscape[U]::
1073     //     ControlEscape
1074     //     c ControlLetter
1075     //     0 [lookahead ∉ DecimalDigit]
1076     //     HexEscapeSequence
1077     //     RegExpUnicodeEscapeSequence[?U]
1078     //     IdentityEscape[?U]
1079     uint32_t result = 0;
1080     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1081     switch (c0_) {
1082         // ControlEscape
1083         case 'f':
1084             result = '\f';
1085             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1086             PrintF("ControlEscape %c\n", c0_);
1087             Advance();
1088             break;
1089         case 'n':
1090             result = '\n';
1091             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1092             PrintF("ControlEscape %c\n", c0_);
1093             Advance();
1094             break;
1095         case 'r':
1096             result = '\r';
1097             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1098             PrintF("ControlEscape %c\n", c0_);
1099             Advance();
1100             break;
1101         case 't':
1102             result = '\t';
1103             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1104             PrintF("ControlEscape %c\n", c0_);
1105             Advance();
1106             break;
1107         case 'v':
1108             result = '\v';
1109             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1110             PrintF("ControlEscape %c\n", c0_);
1111             Advance();
1112             break;
1113         // c ControlLetter
1114         case 'c': {
1115             Advance();
1116             if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1117                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1118                 PrintF("ControlLetter %c\n", c0_);
1119                 result = static_cast<uint32_t>(c0_) & 0x1f;  // NOLINTNEXTLINE(readability-magic-numbers)
1120                 Advance();
1121             } else {
1122                 if (!IsUtf16()) {
1123                     pc_--;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1124                     result = '\\';
1125                 } else {
1126                     ParseError("Invalid control letter");
1127                     return -1;
1128                 }
1129             }
1130             break;
1131         }
1132         case '0': {
1133             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1134             PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
1135             if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) {  // NOLINTNEXTLINE(readability-magic-numbers)
1136                 Advance();
1137                 result = 0;
1138                 break;
1139             }
1140             [[fallthrough]];
1141         }
1142         case '1':
1143         case '2':
1144         case '3':
1145         case '4':
1146         case '5':
1147         case '6':
1148         case '7': {
1149             if (IsUtf16()) {
1150                 // With /u, decimal escape is not interpreted as octal character code.
1151                 ParseError("Invalid class escape");
1152                 return 0;
1153             }
1154             result = ParseOctalLiteral();
1155             break;
1156         }
1157         // ParseHexEscapeSequence
1158         // ParseRegExpUnicodeEscapeSequence
1159         case 'x': {
1160             Advance();
1161             if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
1162                 return result;
1163             }
1164             if (IsUtf16()) {
1165                 ParseError("Invalid class escape");
1166                 return -1;
1167             }
1168             result = 'x';
1169             break;
1170         }
1171         case 'u': {
1172             Advance();
1173             if (ParseUnicodeEscape(&result)) {
1174                 return result;
1175             }
1176             if (IsUtf16()) {
1177                 // With /u, invalid escapes are not treated as identity escapes.
1178                 ParseError("Invalid unicode escape");
1179                 return 0;
1180             }
1181             // If \u is not followed by a two-digit hexadecimal, treat it
1182             // as an identity escape.
1183             result = 'u';
1184             break;
1185         }
1186         // IdentityEscape[?U]
1187         case '$':
1188         case '(':
1189         case ')':
1190         case '*':
1191         case '+':
1192         case '.':
1193         case '/':
1194         case '?':
1195         case '[':
1196         case '\\':
1197         case ']':
1198         case '^':
1199         case '{':
1200         case '|':
1201         case '}':
1202             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1203             PrintF("IdentityEscape %c\n", c0_);
1204             result = c0_;
1205             Advance();
1206             break;
1207         default: {
1208             if (IsUtf16()) {
1209                 ParseError("Invalid unicode escape");
1210                 return 0;
1211             }
1212             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1213             PrintF("SourceCharacter %c\n", c0_);
1214             result = c0_;
1215             if (result < CHAR_MAXS) {
1216                 Advance();
1217             }
1218             break;
1219         }
1220     }
1221     return result;
1222 }
1223 
ParseClassRanges(RangeSet * result)1224 bool RegExpParser::ParseClassRanges(RangeSet *result)
1225 {
1226     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1227     PrintF("Parse ClassRanges------\n");
1228     while (c0_ != ']') {
1229         RangeSet s1;
1230         uint32_t c1 = ParseClassAtom(&s1);
1231         if (c1 == UINT32_MAX) {
1232             ParseError("invalid class range");
1233             return false;
1234         }
1235 
1236         int next_c0 = *pc_;
1237         if (c0_ == '-' && next_c0 != ']') {
1238             if (c1 == CLASS_RANGE_BASE) {
1239                 if (IsUtf16()) {
1240                     ParseError("invalid class range");
1241                     return false;
1242                 }
1243                 result->Insert(s1);
1244                 continue;
1245             }
1246             Advance();
1247             RangeSet s2;
1248             uint32_t c2 = ParseClassAtom(&s2);
1249             if (c2 == UINT32_MAX) {
1250                 ParseError("invalid class range");
1251                 return false;
1252             }
1253             if (c2 == CLASS_RANGE_BASE) {
1254                 if (IsUtf16()) {
1255                     ParseError("invalid class range");
1256                     return false;
1257                 }
1258                 result->Insert(s2);
1259                 continue;
1260             }
1261             if (c1 < INT8_MAX) {
1262                 if (c1 > c2) {
1263                     ParseError("invalid class range");
1264                     return false;
1265                 }
1266             }
1267             if (IsIgnoreCase()) {
1268                 c1 = static_cast<uint32_t>(Canonicalize(c1, IsUtf16()));
1269                 c2 = static_cast<uint32_t>(Canonicalize(c2, IsUtf16()));
1270             }
1271 
1272             result->Insert(c1, c2);
1273         } else {
1274             result->Insert(s1);
1275         }
1276     }
1277     Advance();
1278     return true;
1279 }
1280 
ParseClassAtom(RangeSet * atom)1281 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1282 {
1283     uint32_t ret = UINT32_MAX;
1284     switch (c0_) {
1285         case '\\': {
1286             Advance();
1287             ret = static_cast<uint32_t>(ParseClassEscape(atom));
1288             break;
1289         }
1290         case KEY_EOF:
1291             break;
1292         case 0: {
1293             if (pc_ >= end_) {
1294                 return UINT32_MAX;
1295             }
1296             [[fallthrough]];
1297         }
1298         default: {
1299             uint32_t value = c0_;
1300             size_t u16_size = 0;
1301             if (c0_ > INT8_MAX) {  // NOLINTNEXTLINE(readability-magic-numbers)
1302                 pc_ -= 1;          // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1303                 auto u16_result = base::utf_helper::ConvertUtf8ToUtf16Pair(pc_, true);
1304                 value = u16_result.first;
1305                 u16_size = u16_result.second;
1306                 Advance(u16_size + 1);
1307             } else {
1308                 Advance();
1309             }
1310             if (IsIgnoreCase()) {
1311                 value = static_cast<uint32_t>(Canonicalize(value, IsUtf16()));
1312             }
1313             atom->Insert(RangeSet(value));
1314             ret = value;
1315             break;
1316         }
1317     }
1318     return ret;
1319 }
1320 
ParseClassEscape(RangeSet * atom)1321 int RegExpParser::ParseClassEscape(RangeSet *atom)
1322 {
1323     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1324     PrintF("Parse ClassEscape------\n");
1325     int result = -1;
1326     switch (c0_) {
1327         case 'b':
1328             Advance();
1329             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1330             PrintF("ClassEscape %c", 'b');
1331             result = '\b';
1332             atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1333             break;
1334         case '-':
1335             Advance();
1336             result = '-';
1337             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1338             PrintF("ClassEscape %c", '-');
1339             atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1340             break;
1341         // CharacterClassEscape
1342         case 'd':
1343         case 'D':
1344             result = CLASS_RANGE_BASE;
1345             atom->Insert(g_rangeD);
1346             if (c0_ == 'D') {
1347                 atom->Invert(IsUtf16());
1348             }
1349             Advance();
1350             break;
1351         case 's':
1352         case 'S':
1353             result = CLASS_RANGE_BASE;
1354             atom->Insert(g_rangeS);
1355             if (c0_ == 'S') {
1356                 atom->Invert(IsUtf16());
1357             }
1358             Advance();
1359             break;
1360         case 'w':
1361         case 'W':
1362             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1363             PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1364             result = CLASS_RANGE_BASE;
1365             atom->Insert(g_rangeW);
1366             if (c0_ == 'W') {
1367                 atom->Invert(IsUtf16());
1368             }
1369             Advance();
1370             break;
1371         // P{UnicodePropertyValueExpression}
1372         // p{UnicodePropertyValueExpression}
1373         case 'P':
1374         case 'p':
1375             PrintF("Warning: \\p is not supported in ECMA 2015!");
1376             Advance();
1377             if (c0_ == '{') {
1378                 Advance();
1379                 if (c0_ == '}') {
1380                     break;  // p{}, invalid
1381                 }
1382                 bool isValue = false;
1383                 ParseUnicodePropertyValueCharacters(&isValue);
1384                 if (!isValue && c0_ == '=') {
1385                     // UnicodePropertyName = UnicodePropertyValue
1386                     Advance();
1387                     if (c0_ == '}') {
1388                         break;  // p{xxx=}, invalid
1389                     }
1390                     ParseUnicodePropertyValueCharacters(&isValue);
1391                 }
1392                 if (c0_ != '}') {
1393                     break;  // p{xxx, invalid
1394                 }
1395                 // should do atom->Invert() here after ECMA 9.0
1396                 Advance();
1397                 result = CLASS_RANGE_BASE;
1398             }
1399             break;
1400         default:
1401             result = ParseCharacterEscape();
1402             int value = result;
1403             if (IsIgnoreCase()) {
1404                 value = Canonicalize(value, IsUtf16());
1405             }
1406             atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1407             break;
1408     }
1409     return result;
1410 }
1411 
ParseUnicodePropertyValueCharacters(bool * isValue)1412 void RegExpParser::ParseUnicodePropertyValueCharacters(bool *isValue)
1413 {
1414     if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1415         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1416         PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1417     } else if (c0_ == '_') {
1418         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1419         PrintF("UnicodePropertyCharacter:: _ \n");
1420     } else if (c0_ >= '0' && c0_ <= '9') {
1421         *isValue = true;
1422         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1423         PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1424     } else {
1425         return;
1426     }
1427     Advance();
1428     ParseUnicodePropertyValueCharacters(isValue);
1429 }
1430 
1431 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1432 void RegExpParser::PrintF(const char *fmt, ...)
1433 {
1434 #ifndef _NO_DEBUG_
1435     va_list args;
1436     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,)
1437     va_start(args, fmt);
1438     vprintf(fmt, args);
1439     va_end(args);
1440 #else
1441     (void)fmt;
1442 #endif
1443 }
1444 
ParseError(const char * errorMessage)1445 void RegExpParser::ParseError(const char *errorMessage)
1446 {
1447     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1448     PrintF("error: ");
1449     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1450     PrintF(errorMessage);
1451     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1452     PrintF("\n");
1453     SetIsError();
1454     size_t length = strlen(errorMessage) + 1;
1455     if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1456         LOG_FULL(FATAL) << "memcpy_s failed";
1457         UNREACHABLE();
1458     }
1459 }
1460 
IsIdentFirst(uint32_t c)1461 int RegExpParser::IsIdentFirst(uint32_t c)
1462 {
1463     if (c < CACHE_SIZE) {
1464         return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1; // 5: Shift five bits 31: and operation binary of 31
1465     } else {
1466         return static_cast<int>(u_isIDStart(c));
1467     }
1468 }
1469 }  // namespace panda::ecmascript