• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "ecmascript/regexp/regexp_parser.h"
17 
18 #include "ecmascript/base/string_helper.h"
19 #include "ecmascript/ecma_macros.h"
20 #include "ecmascript/regexp/regexp_opcode.h"
21 #include "libpandabase/utils/utils.h"
22 #include "securec.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uniset.h"
25 #define _NO_DEBUG_
26 
27 namespace panda::ecmascript {
// Sizing constants; both are 128. Their consumers are not in this part of the file.
// NOTE(review): CHAR_MAXS presumably bounds the ASCII range - confirm against its users.
static constexpr uint32_t CACHE_SIZE = 128;
static constexpr uint32_t CHAR_MAXS = 128;
// 128-bit bitmap (4 x 32-bit words) over code points 0..127: bit (cp % 32) of word (cp / 32)
// is set for '$' (0x24), 'A'-'Z', '_' (0x5F) and 'a'-'z'.
static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
    /* $ A-Z _ a-z */
    0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
};
// ASCII decimal digits '0'-'9' (presumably backs the \d class escape - confirm at the use site).
static RangeSet g_rangeD(0x30, 0x39);  // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
// White space and line terminator code points (presumably backs the \s class escape - confirm at the use site).
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_rangeS({
    std::pair<uint32_t, uint32_t>(0x0009, 0x000D),  // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0020, 0x0020),  // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0),  // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x1680, 0x1680),  // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x2000, 0x200A),  // NOLINTNEXTLINE(readability-magic-numbers)
    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
    std::pair<uint32_t, uint32_t>(0x2028, 0x2029),  // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x202F, 0x202F),  // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x205F, 0x205F),  // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x3000, 0x3000),  // NOLINTNEXTLINE(readability-magic-numbers)
    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
    std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF),  // NOLINTNEXTLINE(readability-magic-numbers)
});

// ASCII word characters: '0'-'9', 'A'-'Z', '_' and 'a'-'z'
// (presumably backs the \w class escape - confirm at the use site).
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_rangeW({
    std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x005F, 0x005F),  // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINTNEXTLINE(readability-magic-numbers)
});

// Code points accepted as the first character of an identifier: '$', 'A'-'Z', 'a'-'z'.
// NOTE(review): unlike ID_START_TABLE_ASCII this set does not contain '_' (0x5F) - confirm this is intended.
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_regexpIdentifyStart({
    std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINTNEXTLINE(readability-magic-numbers)
});

// Code points accepted after the first character of an identifier: '$', '0'-'9', 'A'-'Z', 'a'-'z'.
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_regexpIdentifyContinue({
    std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINTNEXTLINE(readability-magic-numbers)
});
74 
Parse()75 void RegExpParser::Parse()
76 {
77     // dynbuffer head init [size,capture_count,statck_count,flags]
78     buffer_.EmitU32(0);
79     buffer_.EmitU32(0);
80     buffer_.EmitU32(0);
81     buffer_.EmitU32(0);
82     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
83     PrintF("Parse Pattern------\n");
84     // Pattern[U, N]::
85     //      Disjunction[?U, ?N]
86     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
87     Advance();
88     SaveStartOpCode saveStartOp;
89     int captureIndex = captureCount_++;
90     saveStartOp.EmitOpCode(&buffer_, captureIndex);
91     ParseDisjunction(false);
92     if (c0_ != KEY_EOF) {
93         ParseError("extraneous characters at the end");
94         return;
95     }
96     SaveEndOpCode saveEndOp;
97     saveEndOp.EmitOpCode(&buffer_, captureIndex);
98     MatchEndOpCode matchEndOp;
99     matchEndOp.EmitOpCode(&buffer_, 0);
100     // dynbuffer head assignments
101     buffer_.PutU32(0, buffer_.size_);
102     buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
103     buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
104     buffer_.PutU32(FLAGS_OFFSET, flags_);
105 #ifndef _NO_DEBUG_
106     RegExpOpCode::DumpRegExpOpCode(std::cout, buffer_);
107 #endif
108 }
109 
ParseDisjunction(bool isBackward)110 void RegExpParser::ParseDisjunction(bool isBackward)
111 {
112     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
113     PrintF("Parse Disjunction------\n");
114     size_t start = buffer_.size_;
115     ParseAlternative(isBackward);
116     if (isError_) {
117         return;
118     }
119     do {
120         if (c0_ == '|') {
121             SplitNextOpCode splitOp;
122             uint32_t len = buffer_.size_ - start;
123             GotoOpCode gotoOp;
124             splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
125             uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
126             Advance();
127             ParseAlternative(isBackward);
128             gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
129         }
130     } while (c0_ != KEY_EOF && c0_ != ')');
131 }
132 
ParseOctalLiteral()133 uint32_t RegExpParser::ParseOctalLiteral()
134 {
135     // For compatibility with some other browsers (not all), we parse
136     // up to three octal digits with a value below 256.
137     // ES#prod-annexB-LegacyOctalEscapeSequence
138     uint32_t value = c0_ - '0';
139     Advance();
140     if (c0_ >= '0' && c0_ <= '7') {
141         value = value * OCTAL_VALUE + c0_ - '0';
142         Advance();
143         if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
144             value = value * OCTAL_VALUE + c0_ - '0';
145             Advance();
146         }
147     }
148     return value;
149 }
150 
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)151 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
152 {
153     uint32_t x = 0;
154     int d = static_cast<int>(HexValue(c0_));
155     if (d < 0) {
156         return false;
157     }
158     while (d >= 0) {
159         if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
160             LOG_FULL(FATAL) << "value overflow";
161             return false;
162         }
163         x = x * HEX_VALUE + static_cast<uint32_t>(d);
164         if (x > maxValue) {
165             return false;
166         }
167         Advance();
168         d = static_cast<int>(HexValue(c0_));
169     }
170     *value = x;
171     return true;
172 }
173 
174 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uint32_t * value)175 bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
176 {
177     // Accept both \uxxxx and \u{xxxxxx} (if allowed).
178     // In the latter case, the number of hex digits between { } is arbitrary.
179     // \ and u have already been read.
180     if (c0_ == '{' && IsUtf16()) {
181         uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
182         Advance();
183         if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {  // NOLINTNEXTLINE(readability-magic-numbers)
184             if (c0_ == '}') {
185                 Advance();
186                 return true;
187             }
188         }
189         pc_ = start;
190         Advance();
191         return false;
192     }
193     // \u but no {, or \u{...} escapes not allowed.
194     bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
195     if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
196         // Attempt to read trail surrogate.
197         uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
198         if (*pc_ == 'u') {
199             Advance(UNICODE_HEX_ADVANCE);
200             uint32_t trail = 0;
201             if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
202                 *value = U16_GET_SUPPLEMENTARY((*value), (trail));  // NOLINTNEXTLINE(hicpp-signed-bitwise)
203                 return true;
204             }
205         }
206         pc_ = start;
207         Advance();
208     }
209     return result;
210 }
211 
ParseHexEscape(int length,uint32_t * value)212 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
213 {
214     uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
215     uint32_t val = 0;
216     for (int i = 0; i < length; ++i) {
217         uint32_t c = c0_;
218         int d = static_cast<int>(HexValue(c));
219         if (d < 0) {
220             pc_ = start;
221             Advance();
222             return false;
223         }
224         val = val * HEX_VALUE + static_cast<uint32_t>(d);
225         Advance();
226     }
227     *value = val;
228     return true;
229 }
230 
// Parses one Alternative: a sequence of terms that ends at '|', ')' or KEY_EOF, emitting
// the bytecode of every term into buffer_.
//
// For each term the function remembers where its bytecode starts (atomBcStart), emits
// the term, and - if the term is an atom and no error occurred - lets ParseQuantifier
// wrap the bytecode emitted since atomBcStart.
//
// When isBackward is true, character-consuming atoms are bracketed by Prev opcodes and,
// after each term, the term's bytecode is moved in front of everything emitted earlier
// by this call, so the terms end up in reverse order.
//
// Returns early (without consuming further input) as soon as isError_ is set.
// NOLINTNEXTLINE(readability-function-size)
void RegExpParser::ParseAlternative(bool isBackward)
{
    // Start of this alternative's bytecode; used as the rotation point for backward terms.
    size_t start = buffer_.size_;
    while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
        if (isError_) {
            return;
        }
        size_t atomBcStart = buffer_.GetSize();
        // Stays 0 unless ParseAssertionCapture opens a capturing group.
        int captureIndex = 0;
        // Only atoms may be followed by a quantifier; assertions leave this false.
        bool isAtom = false;
        switch (c0_) {
            case '^': {
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("Assertion %c line start \n", c0_);
                LineStartOpCode lineStartOp;
                lineStartOp.EmitOpCode(&buffer_, 0);
                Advance();
                break;
            }
            case '$': {
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("Assertion %c line end \n", c0_);
                LineEndOpCode lineEndOp;
                lineEndOp.EmitOpCode(&buffer_, 0);
                Advance();
                break;
            }
            case '\\': {
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("Escape %c \n", c0_);
                Advance();
                switch (c0_) {
                    case 'b': {
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("Assertion %c \n", c0_);
                        WordBoundaryOpCode wordBoundaryOp;
                        wordBoundaryOp.EmitOpCode(&buffer_, 0);
                        Advance();
                        break;
                    }
                    case 'B': {
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("Assertion %c \n", c0_);
                        NotWordBoundaryOpCode notWordBoundaryOp;
                        notWordBoundaryOp.EmitOpCode(&buffer_, 0);
                        Advance();
                        break;
                    }
                    default: {
                        // Every other escape is an atom. ParseAtomEscape returns -1 when no
                        // single-character opcode has to be emitted here (presumably because it
                        // emitted the bytecode itself or reported an error - confirm in ParseAtomEscape).
                        isAtom = true;
                        int atomValue = ParseAtomEscape(isBackward);
                        if (atomValue != -1) {
                            PrevOpCode prevOp;
                            if (isBackward) {
                                prevOp.EmitOpCode(&buffer_, 0);
                            }
                            if (IsIgnoreCase()) {
                                if (!IsUtf16()) {
                                    atomValue = Canonicalize(atomValue, false);
                                } else {
                                    // Unicode + ignore-case: match any code point in the case-insensitive
                                    // closure of atomValue (multi-character strings removed).
                                    icu::UnicodeSet set(atomValue, atomValue);
                                    set.closeOver(USET_CASE_INSENSITIVE);
                                    set.removeAllStrings();
                                    uint32_t size = static_cast<uint32_t>(set.size());
                                    RangeOpCode rangeOp;
                                    RangeSet rangeResult;
                                    for (uint32_t idx = 0; idx < size; idx++) {
                                        int32_t uc = set.charAt(idx);
                                        RangeSet curRange(uc);
                                        rangeResult.Insert(curRange);
                                    }
                                    rangeOp.InsertOpCode(&buffer_, rangeResult);
                                    // NOTE(review): this break leaves the inner switch before the trailing
                                    // Prev opcode is emitted for isBackward, and RangeOpCode is used even for
                                    // code points above UINT16_MAX (the '[' case switches to Range32OpCode)
                                    // - confirm both are intended.
                                    break;
                                }
                            }
                            if (atomValue <= UINT16_MAX) {
                                CharOpCode charOp;
                                charOp.EmitOpCode(&buffer_, atomValue);
                            } else {
                                Char32OpCode charOp;
                                charOp.EmitOpCode(&buffer_, atomValue);
                            }
                            if (isBackward) {
                                prevOp.EmitOpCode(&buffer_, 0);
                            }
                        }
                        break;
                    }
                }
                break;
            }
            case '(': {
                Advance();
                isAtom = ParseAssertionCapture(&captureIndex, isBackward);
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                Advance();
                break;
            }
            case '.': {
                PrevOpCode prevOp;
                if (isBackward) {
                    prevOp.EmitOpCode(&buffer_, 0);
                }
                if (IsDotAll()) {
                    AllOpCode allOp;
                    allOp.EmitOpCode(&buffer_, 0);
                } else {
                    DotsOpCode dotsOp;
                    dotsOp.EmitOpCode(&buffer_, 0);
                }
                if (isBackward) {
                    prevOp.EmitOpCode(&buffer_, 0);
                }
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("Atom %c match any \n", c0_);
                isAtom = true;
                Advance();
                break;
            }
            case '[': {
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("Atom %c match range \n", c0_);
                isAtom = true;
                PrevOpCode prevOp;
                Advance();
                if (isBackward) {
                    prevOp.EmitOpCode(&buffer_, 0);
                }
                bool isInvert = false;
                if (c0_ == '^') {
                    isInvert = true;
                    Advance();
                }
                RangeSet rangeResult;
                if (!ParseClassRanges(&rangeResult)) {
                    break;
                }
                if (isInvert) {
                    rangeResult.Invert(IsUtf16());
                }
                // Pick the 16-bit or 32-bit range opcode depending on the largest code point.
                uint32_t highValue = rangeResult.HighestValue();
                if (highValue <= UINT16_MAX) {
                    RangeOpCode rangeOp;
                    rangeOp.InsertOpCode(&buffer_, rangeResult);
                } else {
                    Range32OpCode rangeOp;
                    rangeOp.InsertOpCode(&buffer_, rangeResult);
                }

                if (isBackward) {
                    prevOp.EmitOpCode(&buffer_, 0);
                }
                break;
            }
            case '*':
            case '+':
            case '?':
                // A quantifier with no preceding atom.
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                ParseError("nothing to repeat");
                return;
            case '{': {
                // A well-formed {n}, {n,} or {n,m} here has nothing to repeat. Anything else
                // is rewound and falls through to be handled like '}' / ']'.
                uint8_t *begin = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                int dummy;
                if (ParserIntervalQuantifier(&dummy, &dummy)) {
                    ParseError("nothing to repeat");
                    return;
                }
                pc_ = begin;
                Advance();
            }
                [[fallthrough]];
            case '}':
            case ']':
                // Stray brackets are a syntax error when IsUtf16() is true; otherwise they
                // are matched literally by the default case.
                if (IsUtf16()) {
                    ParseError("syntax error");
                    return;
                }
                [[fallthrough]];
            default: {
                // PatternCharacter
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("PatternCharacter %c\n", c0_);
                isAtom = true;
                {
                    PrevOpCode prevOp;
                    if (isBackward) {
                        prevOp.EmitOpCode(&buffer_, 0);
                    }
                    uint32_t matchedChar = c0_;
                    // Values above 0x80 are decoded as a UTF-8 sequence: step back onto the
                    // current byte, decode one code point with U8_NEXT and skip its bytes.
                    // NOTE(review): the comparison excludes 0x80 itself - confirm this is intended.
                    if (c0_ > (INT8_MAX + 1)) {
                        Prev();
                        int i = 0;
                        UChar32 c;
                        int32_t length = end_ - pc_ + 1;
                        // NOLINTNEXTLINE(hicpp-signed-bitwise)
                        U8_NEXT(pc_, i, length, c);  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                        matchedChar = static_cast<uint32_t>(c);
                        pc_ += i;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                    }
                    if (IsIgnoreCase()) {
                        matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
                    }
                    if (matchedChar > UINT16_MAX) {
                        Char32OpCode charOp;
                        charOp.EmitOpCode(&buffer_, matchedChar);
                    } else {
                        CharOpCode charOp;
                        charOp.EmitOpCode(&buffer_, matchedChar);
                    }
                    if (isBackward) {
                        prevOp.EmitOpCode(&buffer_, 0);
                    }
                }
                Advance();
                break;
            }
        }
        if (isAtom && !isError_) {
            ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
        }
        if (isBackward) {
            // Move the bytes of the term just emitted, [atomBcStart, end), in front of the
            // bytes emitted earlier by this call, [start, atomBcStart):
            //   1. shift [start, end) right by termSize (the term now sits at buf_ + end),
            //   2. copy the term from buf_ + end back to buf_ + start.
            size_t end = buffer_.GetSize();
            size_t termSize = end - atomBcStart;
            size_t moveSize = end - start;
            buffer_.Expand(end + termSize);
            if (memmove_s(buffer_.buf_ + start +  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                              termSize,           // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                          moveSize,
                          buffer_.buf_ + start,  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                          moveSize) != EOK) {
                LOG_FULL(FATAL) << "memmove_s failed";
                UNREACHABLE();
            }
            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
            if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
                LOG_FULL(FATAL) << "memcpy_s failed";
                UNREACHABLE();
            }
        }
    }
}
473 
FindGroupName(const CString & name)474 int RegExpParser::FindGroupName(const CString &name)
475 {
476     size_t len = 0;
477     size_t nameLen = name.size();
478     const char *p = reinterpret_cast<char *>(groupNames_.buf_);
479     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
480     const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
481     int captureIndex = 1;
482     while (p < bufEnd) {
483         len = strlen(p);
484         if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
485             return captureIndex;
486         }
487         p += len + 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
488         captureIndex++;
489     }
490     return -1;
491 }
492 
// Parses what follows an opening '(' (already consumed by the caller):
//   (?=...)  (?!...)     lookahead assertions
//   (?<=...) (?<!...)    lookbehind assertions (their body is parsed with isBackward = true)
//   (?<name>...)         named capturing group
//   (?:...)              non-capturing group
//   (...)                plain capturing group
//
// captureIndex: receives the index of the group when a capturing group is opened;
//               it is not written for assertions and non-capturing groups.
// isBackward:   direction of the enclosing context, forwarded to nested disjunctions.
//
// Returns true when the construct is an atom (capturing and non-capturing groups),
// false for assertions and on error. On success c0_ is the closing ')', which is left
// for the caller to consume.
bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
{
    bool isAtom = false;
    do {
        if (c0_ == '?') {
            Advance();
            switch (c0_) {
                // (?=Disjunction[?U, ?N])
                case '=': {
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                    PrintF("Assertion(?= Disjunction)\n");
                    Advance();
                    // Body bytecode is terminated by Match and then prefixed with MatchAhead,
                    // whose operand is the length of body + Match.
                    uint32_t start = buffer_.size_;
                    ParseDisjunction(isBackward);
                    MatchOpCode matchOp;
                    matchOp.EmitOpCode(&buffer_, 0);
                    MatchAheadOpCode matchAheadOp;
                    uint32_t len = buffer_.size_ - start;
                    matchAheadOp.InsertOpCode(&buffer_, start, len);
                    break;
                }
                // (?!Disjunction[?U, ?N])
                case '!': {
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                    PrintF("Assertion(?! Disjunction)\n");
                    uint32_t start = buffer_.size_;
                    Advance();
                    ParseDisjunction(isBackward);
                    MatchOpCode matchOp;
                    matchOp.EmitOpCode(&buffer_, 0);
                    NegativeMatchAheadOpCode matchAheadOp;
                    uint32_t len = buffer_.size_ - start;
                    matchAheadOp.InsertOpCode(&buffer_, start, len);
                    break;
                }
                case '<': {
                    Advance();
                    // (?<=Disjunction[?U, ?N])
                    if (c0_ == '=') {
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("Assertion(?<= Disjunction)\n");
                        Advance();
                        // Same opcodes as (?=...), but the body is compiled for backward matching.
                        uint32_t start = buffer_.size_;
                        ParseDisjunction(true);
                        MatchOpCode matchOp;
                        matchOp.EmitOpCode(&buffer_, 0);
                        MatchAheadOpCode matchAheadOp;
                        uint32_t len = buffer_.size_ - start;
                        matchAheadOp.InsertOpCode(&buffer_, start, len);
                        // (?<!Disjunction[?U, ?N])
                    } else if (c0_ == '!') {
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("Assertion(?<! Disjunction)\n");
                        Advance();
                        uint32_t start = buffer_.size_;
                        ParseDisjunction(true);
                        MatchOpCode matchOp;
                        matchOp.EmitOpCode(&buffer_, 0);
                        NegativeMatchAheadOpCode matchAheadOp;
                        uint32_t len = buffer_.size_ - start;
                        matchAheadOp.InsertOpCode(&buffer_, start, len);
                    } else {
                        // (?<name>Disjunction): step back so ParseGroupSpecifier sees the name from
                        // its expected start position, then record the name and continue as a
                        // regular capturing group.
                        Prev();
                        CString name;
                        auto **pp = const_cast<const uint8_t **>(&pc_);
                        if (!ParseGroupSpecifier(pp, name)) {
                            ParseError("GroupName Syntax error.");
                            return false;
                        }
                        if (FindGroupName(name) > 0) {
                            ParseError("Duplicate GroupName error.");
                            return false;
                        }
                        groupNames_.EmitStr(name.c_str());
                        newGroupNames_.push_back(name);
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("group name %s", name.c_str());
                        Advance();
                        // Jumps into the else-branch below, past the empty-name placeholder.
                        goto parseCapture;  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto)
                    }
                    break;
                }
                // (?:Disjunction[?U, ?N])
                case ':':
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                    PrintF("Atom(?<: Disjunction)\n");
                    isAtom = true;
                    Advance();
                    ParseDisjunction(isBackward);
                    break;
                default:
                    Advance();
                    ParseError("? Syntax error.");
                    return false;
            }
            if (isError_) {
                return false;
            }
        } else {
            // Unnamed capturing group: store an empty name so that entries in groupNames_
            // stay aligned with the capture groups.
            groupNames_.EmitChar(0);
        parseCapture:
            isAtom = true;
            *captureIndex = captureCount_++;
            SaveEndOpCode saveEndOp;
            SaveStartOpCode saveStartOp;
            // In backward mode the save opcodes are emitted in swapped order
            // (SaveEnd before the body, SaveStart after it).
            if (isBackward) {
                saveEndOp.EmitOpCode(&buffer_, *captureIndex);
            } else {
                saveStartOp.EmitOpCode(&buffer_, *captureIndex);
            }
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("capture start %d \n", *captureIndex);
            ParseDisjunction(isBackward);
            if (isError_) {
                return false;
            }
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("capture end %d \n", *captureIndex);
            if (isBackward) {
                saveStartOp.EmitOpCode(&buffer_, *captureIndex);
            } else {
                saveEndOp.EmitOpCode(&buffer_, *captureIndex);
            }
        }
    } while (c0_ != ')' && c0_ != KEY_EOF);
    // Input ended before the group was closed.
    if (c0_ != ')') {
        ParseError("capture syntax error");
        return false;
    }
    return isAtom;
}
624 
ParseDecimalDigits()625 int RegExpParser::ParseDecimalDigits()
626 {
627     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
628     PrintF("Parse DecimalDigits------\n");
629     uint32_t result = 0;
630     bool overflow = false;
631     while (true) {
632         if (c0_ < '0' || c0_ > '9') {
633             break;
634         }
635         if (!overflow) {
636             if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
637                 overflow = true;
638             } else {
639                 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
640             }
641         }
642         Advance();
643     }
644     if (overflow) {
645         return INT32_MAX;
646     }
647     return result;
648 }
649 
ParserIntervalQuantifier(int * pmin,int * pmax)650 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
651 {
652     // Quantifier::
653     //     QuantifierPrefix
654     //     QuantifierPrefix?
655     // QuantifierPrefix::
656     // *
657     // +
658     // ?
659     // {DecimalDigits}
660     // {DecimalDigits,}
661     // {DecimalDigits,DecimalDigits}
662     Advance();
663     *pmin = ParseDecimalDigits();
664     *pmax = *pmin;
665     switch (c0_) {
666         case ',': {
667             Advance();
668             if (c0_ == '}') {
669                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
670                 PrintF("QuantifierPrefix{DecimalDigits,}\n");
671                 *pmax = INT32_MAX;
672                 Advance();
673             } else {
674                 *pmax = ParseDecimalDigits();
675                 if (c0_ == '}') {
676                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
677                     PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
678                     Advance();
679                 } else {
680                     return false;
681                 }
682             }
683             break;
684         }
685         case '}':
686             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
687             PrintF("QuantifierPrefix{DecimalDigits}\n");
688             Advance();
689             break;
690         default:
691             Advance();
692             return false;
693     }
694     return true;
695 }
696 
ParseQuantifier(size_t atomBcStart,int captureStart,int captureEnd)697 void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
698 {
699     int min = -1;
700     int max = -1;
701     bool isGreedy = true;
702     switch (c0_) {
703         case '*':
704             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
705             PrintF("QuantifierPrefix %c\n", c0_);
706             min = 0;
707             max = INT32_MAX;
708             Advance();
709             break;
710         case '+':
711             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
712             PrintF("QuantifierPrefix %c\n", c0_);
713             min = 1;
714             max = INT32_MAX;
715             Advance();
716             break;
717         case '?':
718             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
719             PrintF("QuantifierPrefix %c\n", c0_);
720             Advance();
721             min = 0;
722             max = 1;
723             break;
724         case '{': {
725             uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
726             if (!ParserIntervalQuantifier(&min, &max)) {
727                 pc_ = start;
728                 Advance();  // back to '{'
729                 return;
730             }
731             if (min > max) {
732                 ParseError("Invalid repetition count");
733                 return;
734             }
735             break;
736         }
737         default:
738             break;
739     }
740     if (c0_ == '?') {
741         isGreedy = false;
742         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
743         PrintF("Quantifier::QuantifierPrefix?\n");
744         Advance();
745     } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
746         ParseError("nothing to repeat");
747         return;
748     }
749     if (min != -1 && max != -1) {
750         stackCount_++;
751         PushOpCode pushOp;
752         pushOp.InsertOpCode(&buffer_, atomBcStart);
753         atomBcStart += pushOp.GetSize();
754 
755         if (captureStart != 0) {
756             SaveResetOpCode saveResetOp;
757             saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
758         }
759 
760         // zero advance check
761         if (max == INT32_MAX) {
762             stackCount_++;
763             PushCharOpCode pushCharOp;
764             pushCharOp.InsertOpCode(&buffer_, atomBcStart);
765             CheckCharOpCode checkCharOp;
766             // NOLINTNEXTLINE(readability-magic-numbers)
767             checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
768         }
769 
770         if (isGreedy) {
771             LoopGreedyOpCode loopOp;
772             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
773         } else {
774             LoopOpCode loopOp;
775             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
776         }
777 
778         if (min == 0) {
779             if (isGreedy) {
780                 SplitNextOpCode splitNextOp;
781                 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
782             } else {
783                 SplitFirstOpCode splitFirstOp;
784                 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
785             }
786         }
787 
788         PopOpCode popOp;
789         popOp.EmitOpCode(&buffer_);
790     }
791 }
792 
ParseGroupSpecifier(const uint8_t ** pp,CString & name)793 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, CString &name)
794 {
795     const uint8_t *p = *pp;
796     uint32_t c = 0;
797     char buffer[CACHE_SIZE] = {0};
798     char *q = buffer;
799     while (true) {
800         if (p <= end_) {
801             c = *p;
802         } else {
803             c = KEY_EOF;
804         }
805         if (c == '\\') {
806             p++;
807             if (*p != 'u') {
808                 return false;
809             }
810             if (!ParseUnicodeEscape(&c)) {
811                 return false;
812             }
813         } else if (c == '>') {
814             break;
815         } else if (c > CACHE_SIZE && c != KEY_EOF) {
816             c = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
817         } else if (c != KEY_EOF) {
818             p++;
819         } else {
820             return false;
821         }
822         if (q == buffer) {
823             if (!IsIdentFirst(c)) {
824                 return false;
825             }
826         } else {
827             if (!u_isIDPart(c)) {
828                 return false;
829             }
830         }
831         if (q != nullptr) {
832             *q++ = c;
833         }
834     } // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
835     p++;
836     *pp = p;
837     name = buffer;
838     return true;
839 }
840 
ParseCaptureCount(const char * groupName)841 int RegExpParser::ParseCaptureCount(const char *groupName)
842 {
843     const uint8_t *p = nullptr;
844     int captureIndex = 1;
845     CString name;
846     hasNamedCaptures_ = 0;
847     for (p = base_; p < end_; p++) {  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
848         switch (*p) {
849             case '(': {
850                 if (p[1] == '?') {  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
851                     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
852                     if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
853                         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
854                         p[CAPTURE_CONUT_ADVANCE] != '=') {
855                         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
856                         hasNamedCaptures_ = 1;
857                         p += CAPTURE_CONUT_ADVANCE;
858                         if (groupName != nullptr) {
859                             if (ParseGroupSpecifier(&p, name)) {
860                                 if (strcmp(name.c_str(), groupName) == 0) {
861                                     return captureIndex;
862                                 }
863                             }
864                         }
865                         captureIndex++;
866                     }
867                 } else {
868                     captureIndex++;
869                 }
870                 break;
871             }
872             case '\\':
873                 p++;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
874                 break;
875             case '[': {
876                 while (p < end_ && *p != ']') {
877                     if (*p == '\\') {
878                         p++;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
879                     }
880                     p++;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
881                 }
882                 break;
883             }
884             default:
885                 break;
886         }
887     }
888     return captureIndex;
889 }
890 
891 // NOLINTNEXTLINE(readability-function-size)
ParseAtomEscape(bool isBackward)892 int RegExpParser::ParseAtomEscape(bool isBackward)
893 {
894     // AtomEscape[U, N]::
895     //     DecimalEscape
896     //     CharacterClassEscape[?U]
897     //     CharacterEscape[?U]
898     //     [+N]kGroupName[?U]
899     int result = -1;
900     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
901     PrintF("Parse AtomEscape------\n");
902     PrevOpCode prevOp;
903     switch (c0_) {
904         case KEY_EOF:
905             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
906             ParseError("unexpected end");
907             break;
908         // DecimalEscape
909         case '1':
910         case '2':
911         case '3':
912         case '4':
913         case '5':
914         case '6':
915         case '7':
916         case '8':
917         case '9': {
918             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
919             PrintF("NonZeroDigit %c\n", c0_);
920             int capture = ParseDecimalDigits();
921             if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
922                 ParseError("invalid backreference count");
923                 break;
924             }
925             if (isBackward) {
926                 BackwardBackReferenceOpCode backReferenceOp;
927                 backReferenceOp.EmitOpCode(&buffer_, capture);
928             } else {
929                 BackReferenceOpCode backReferenceOp;
930                 backReferenceOp.EmitOpCode(&buffer_, capture);
931             }
932             break;
933         }
934         // CharacterClassEscape
935         case 'd': {
936             // [0-9]
937             RangeOpCode rangeOp;
938             if (isBackward) {
939                 prevOp.EmitOpCode(&buffer_, 0);
940             }
941             rangeOp.InsertOpCode(&buffer_, g_rangeD);
942             goto parseLookBehind;
943         }
944         case 'D': {
945             // [^0-9]
946             RangeSet atomRange(g_rangeD);
947             atomRange.Invert(IsUtf16());
948             Range32OpCode rangeOp;
949             if (isBackward) {
950                 prevOp.EmitOpCode(&buffer_, 0);
951             }
952             rangeOp.InsertOpCode(&buffer_, atomRange);
953             goto parseLookBehind;
954         }
955         case 's': {
956             // [\f\n\r\t\v]
957             RangeOpCode rangeOp;
958             if (isBackward) {
959                 prevOp.EmitOpCode(&buffer_, 0);
960             }
961             rangeOp.InsertOpCode(&buffer_, g_rangeS);
962             goto parseLookBehind;
963         }
964         case 'S': {
965             RangeSet atomRange(g_rangeS);
966             Range32OpCode rangeOp;
967             atomRange.Invert(IsUtf16());
968             if (isBackward) {
969                 prevOp.EmitOpCode(&buffer_, 0);
970             }
971             rangeOp.InsertOpCode(&buffer_, atomRange);
972             goto parseLookBehind;
973         }
974         case 'w': {
975             // [A-Za-z0-9]
976             RangeOpCode rangeOp;
977             if (isBackward) {
978                 prevOp.EmitOpCode(&buffer_, 0);
979             }
980             rangeOp.InsertOpCode(&buffer_, g_rangeW);
981             goto parseLookBehind;
982         }
983         case 'W': {
984             // [^A-Za-z0-9]
985             RangeSet atomRange(g_rangeW);
986             atomRange.Invert(IsUtf16());
987             Range32OpCode rangeOp;
988             if (isBackward) {
989                 prevOp.EmitOpCode(&buffer_, 0);
990             }
991             rangeOp.InsertOpCode(&buffer_, atomRange);
992             goto parseLookBehind;
993         }
994         // P{UnicodePropertyValueExpression}
995         // p{UnicodePropertyValueExpression}
996         case 'P':
997         case 'p':
998         // [+N]kGroupName[?U]
999         case 'k': {
1000             Advance();
1001             if (c0_ != '<') {
1002                 if (!IsUtf16() || HasNamedCaptures()) {
1003                     ParseError("expecting group name.");
1004                     break;
1005                 }
1006             }
1007             Advance();
1008             Prev();
1009             CString name;
1010             auto **pp = const_cast<const uint8_t **>(&pc_);
1011             if (!ParseGroupSpecifier(pp, name)) {
1012                 ParseError("GroupName Syntax error.");
1013                 break;
1014             }
1015             int postion = FindGroupName(name);
1016             if (postion < 0) {
1017                 postion = ParseCaptureCount(name.c_str());
1018                 if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
1019                     ParseError("group name not defined");
1020                     break;
1021                 }
1022             }
1023             if (isBackward) {
1024                 BackwardBackReferenceOpCode backReferenceOp;
1025                 backReferenceOp.EmitOpCode(&buffer_, postion);
1026             } else {
1027                 BackReferenceOpCode backReferenceOp;
1028                 backReferenceOp.EmitOpCode(&buffer_, postion);
1029             }
1030             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1031             Advance();
1032             break;
1033         }
1034         parseLookBehind: {
1035             if (isBackward) {
1036                 prevOp.EmitOpCode(&buffer_, 0);
1037             }
1038             Advance();
1039             break;
1040         }
1041         default:
1042             result = ParseCharacterEscape();
1043             break;
1044     }
1045     return result;
1046 }
1047 
RecountCaptures()1048 int RegExpParser::RecountCaptures()
1049 {
1050     if (totalCaptureCount_ < 0) {
1051         const char *name = reinterpret_cast<const char*>(groupNames_.buf_);
1052         totalCaptureCount_ = ParseCaptureCount(name);
1053     }
1054     return totalCaptureCount_;
1055 }
HasNamedCaptures()1056 bool RegExpParser::HasNamedCaptures()
1057 {
1058     if (hasNamedCaptures_ < 0) {
1059         RecountCaptures();
1060     }
1061     return false;
1062 }
1063 
ParseCharacterEscape()1064 int RegExpParser::ParseCharacterEscape()
1065 {
1066     // CharacterEscape[U]::
1067     //     ControlEscape
1068     //     c ControlLetter
1069     //     0 [lookahead ∉ DecimalDigit]
1070     //     HexEscapeSequence
1071     //     RegExpUnicodeEscapeSequence[?U]
1072     //     IdentityEscape[?U]
1073     uint32_t result = 0;
1074     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1075     switch (c0_) {
1076         // ControlEscape
1077         case 'f':
1078             result = '\f';
1079             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1080             PrintF("ControlEscape %c\n", c0_);
1081             Advance();
1082             break;
1083         case 'n':
1084             result = '\n';
1085             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1086             PrintF("ControlEscape %c\n", c0_);
1087             Advance();
1088             break;
1089         case 'r':
1090             result = '\r';
1091             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1092             PrintF("ControlEscape %c\n", c0_);
1093             Advance();
1094             break;
1095         case 't':
1096             result = '\t';
1097             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1098             PrintF("ControlEscape %c\n", c0_);
1099             Advance();
1100             break;
1101         case 'v':
1102             result = '\v';
1103             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1104             PrintF("ControlEscape %c\n", c0_);
1105             Advance();
1106             break;
1107         // c ControlLetter
1108         case 'c': {
1109             Advance();
1110             if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1111                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1112                 PrintF("ControlLetter %c\n", c0_);
1113                 result = static_cast<uint32_t>(c0_) & 0x1f;  // NOLINTNEXTLINE(readability-magic-numbers)
1114                 Advance();
1115             } else {
1116                 if (!IsUtf16()) {
1117                     pc_--;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1118                     result = '\\';
1119                 } else {
1120                     ParseError("Invalid control letter");
1121                     return -1;
1122                 }
1123             }
1124             break;
1125         }
1126         case '0': {
1127             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1128             PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
1129             if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) {  // NOLINTNEXTLINE(readability-magic-numbers)
1130                 Advance();
1131                 result = 0;
1132                 break;
1133             }
1134             [[fallthrough]];
1135         }
1136         case '1':
1137         case '2':
1138         case '3':
1139         case '4':
1140         case '5':
1141         case '6':
1142         case '7': {
1143             if (IsUtf16()) {
1144                 // With /u, decimal escape is not interpreted as octal character code.
1145                 ParseError("Invalid class escape");
1146                 return 0;
1147             }
1148             result = ParseOctalLiteral();
1149             break;
1150         }
1151         // ParseHexEscapeSequence
1152         // ParseRegExpUnicodeEscapeSequence
1153         case 'x': {
1154             Advance();
1155             if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
1156                 return result;
1157             }
1158             if (IsUtf16()) {
1159                 ParseError("Invalid class escape");
1160                 return -1;
1161             }
1162             result = 'x';
1163             break;
1164         }
1165         case 'u': {
1166             Advance();
1167             if (ParseUnicodeEscape(&result)) {
1168                 return result;
1169             }
1170             if (IsUtf16()) {
1171                 // With /u, invalid escapes are not treated as identity escapes.
1172                 ParseError("Invalid unicode escape");
1173                 return 0;
1174             }
1175             // If \u is not followed by a two-digit hexadecimal, treat it
1176             // as an identity escape.
1177             result = 'u';
1178             break;
1179         }
1180         // IdentityEscape[?U]
1181         case '$':
1182         case '(':
1183         case ')':
1184         case '*':
1185         case '+':
1186         case '.':
1187         case '/':
1188         case '?':
1189         case '[':
1190         case '\\':
1191         case ']':
1192         case '^':
1193         case '{':
1194         case '|':
1195         case '}':
1196             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1197             PrintF("IdentityEscape %c\n", c0_);
1198             result = c0_;
1199             Advance();
1200             break;
1201         default: {
1202             if (IsUtf16()) {
1203                 ParseError("Invalid unicode escape");
1204                 return 0;
1205             }
1206             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1207             PrintF("SourceCharacter %c\n", c0_);
1208             result = c0_;
1209             if (result < CHAR_MAXS) {
1210                 Advance();
1211             } else {
1212                 Prev();
1213                 const uint8_t *p = pc_;
1214                 result = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
1215                 int offset = static_cast<int>(p - pc_);
1216                 Advance(offset + 1);
1217             }
1218             break;
1219         }
1220     }
1221     return static_cast<int>(result);
1222 }
1223 
ParseClassRanges(RangeSet * result)1224 bool RegExpParser::ParseClassRanges(RangeSet *result)
1225 {
1226     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1227     PrintF("Parse ClassRanges------\n");
1228     while (c0_ != ']') {
1229         RangeSet s1;
1230         bool needInter = false;
1231         uint32_t c1 = ParseClassAtom(&s1);
1232         if (c1 == UINT32_MAX) {
1233             ParseError("invalid class range");
1234             return false;
1235         }
1236         needInter = NeedIntersection(c1);
1237         int next_c0 = *pc_;
1238         if (c0_ == '-' && next_c0 != ']') {
1239             if (c1 == CLASS_RANGE_BASE) {
1240                 if (IsUtf16()) {
1241                     ParseError("invalid class range");
1242                     return false;
1243                 }
1244                 result->Insert(s1);
1245                 continue;
1246             }
1247             Advance();
1248             RangeSet s2;
1249             uint32_t c2 = ParseClassAtom(&s2);
1250             if (c2 == UINT32_MAX) {
1251                 ParseError("invalid class range");
1252                 return false;
1253             }
1254             if (c2 == CLASS_RANGE_BASE) {
1255                 if (IsUtf16()) {
1256                     ParseError("invalid class range");
1257                     return false;
1258                 }
1259                 result->Insert(s2);
1260                 continue;
1261             }
1262             if (c1 < INT8_MAX) {
1263                 if (c1 > c2) {
1264                     ParseError("invalid class range");
1265                     return false;
1266                 }
1267             }
1268             needInter = NeedIntersection(c2);
1269             result->Insert(c1, c2);
1270             if (IsIgnoreCase() && needInter) {
1271                 ProcessIntersection(result);
1272             }
1273         } else {
1274             result->Insert(s1);
1275             if (!(IsIgnoreCase() && needInter)) {
1276                 continue;
1277             }
1278             if (c1 <= 'z' && c1 >= 'a') {
1279                 result->Insert(RangeSet(c1 - 'a' + 'A'));
1280             } else {
1281                 result->Insert(RangeSet(c1 - 'A' + 'a'));
1282             }
1283         }
1284     }
1285     Advance();
1286     return true;
1287 }
1288 
ParseClassAtom(RangeSet * atom)1289 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1290 {
1291     uint32_t ret = UINT32_MAX;
1292     switch (c0_) {
1293         case '\\': {
1294             Advance();
1295             ret = static_cast<uint32_t>(ParseClassEscape(atom));
1296             break;
1297         }
1298         case KEY_EOF:
1299             break;
1300         case 0: {
1301             if (pc_ >= end_) {
1302                 return UINT32_MAX;
1303             }
1304             [[fallthrough]];
1305         }
1306         default: {
1307             uint32_t value = c0_;
1308             size_t u16_size = 0;
1309             if (c0_ > INT8_MAX) {  // NOLINTNEXTLINE(readability-magic-numbers)
1310                 pc_ -= 1;          // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1311                 auto u16_result = base::utf_helper::ConvertUtf8ToUtf16Pair(pc_, true);
1312                 value = u16_result.first;
1313                 u16_size = u16_result.second;
1314                 Advance(u16_size + 1);
1315             } else {
1316                 Advance();
1317             }
1318             atom->Insert(RangeSet(value));
1319             ret = value;
1320             break;
1321         }
1322     }
1323     return ret;
1324 }
1325 
ParseClassEscape(RangeSet * atom)1326 int RegExpParser::ParseClassEscape(RangeSet *atom)
1327 {
1328     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1329     PrintF("Parse ClassEscape------\n");
1330     int result = -1;
1331     switch (c0_) {
1332         case 'b':
1333             Advance();
1334             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1335             PrintF("ClassEscape %c", 'b');
1336             result = '\b';
1337             atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1338             break;
1339         case '-':
1340             Advance();
1341             result = '-';
1342             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1343             PrintF("ClassEscape %c", '-');
1344             atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1345             break;
1346         // CharacterClassEscape
1347         case 'd':
1348         case 'D':
1349             result = CLASS_RANGE_BASE;
1350             atom->Insert(g_rangeD);
1351             if (c0_ == 'D') {
1352                 atom->Invert(IsUtf16());
1353             }
1354             Advance();
1355             break;
1356         case 's':
1357         case 'S':
1358             result = CLASS_RANGE_BASE;
1359             atom->Insert(g_rangeS);
1360             if (c0_ == 'S') {
1361                 atom->Invert(IsUtf16());
1362             }
1363             Advance();
1364             break;
1365         case 'w':
1366         case 'W':
1367             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1368             PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1369             result = CLASS_RANGE_BASE;
1370             atom->Insert(g_rangeW);
1371             if (c0_ == 'W') {
1372                 atom->Invert(IsUtf16());
1373             }
1374             Advance();
1375             break;
1376         // P{UnicodePropertyValueExpression}
1377         // p{UnicodePropertyValueExpression}
1378         case 'P':
1379         case 'p':
1380             PrintF("Warning: \\p is not supported in ECMA 2015!");
1381             Advance();
1382             if (c0_ == '{') {
1383                 Advance();
1384                 if (c0_ == '}') {
1385                     break;  // p{}, invalid
1386                 }
1387                 bool isValue = false;
1388                 ParseUnicodePropertyValueCharacters(&isValue);
1389                 if (!isValue && c0_ == '=') {
1390                     // UnicodePropertyName = UnicodePropertyValue
1391                     Advance();
1392                     if (c0_ == '}') {
1393                         break;  // p{xxx=}, invalid
1394                     }
1395                     ParseUnicodePropertyValueCharacters(&isValue);
1396                 }
1397                 if (c0_ != '}') {
1398                     break;  // p{xxx, invalid
1399                 }
1400                 // should do atom->Invert() here after ECMA 9.0
1401                 Advance();
1402                 result = CLASS_RANGE_BASE;
1403             }
1404             break;
1405         default:
1406             result = ParseCharacterEscape();
1407             int value = result;
1408             if (IsIgnoreCase()) {
1409                 value = Canonicalize(value, IsUtf16());
1410             }
1411             atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1412             break;
1413     }
1414     return result;
1415 }
1416 
ParseUnicodePropertyValueCharacters(bool * isValue)1417 void RegExpParser::ParseUnicodePropertyValueCharacters(bool *isValue)
1418 {
1419     if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1420         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1421         PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1422     } else if (c0_ == '_') {
1423         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1424         PrintF("UnicodePropertyCharacter:: _ \n");
1425     } else if (c0_ >= '0' && c0_ <= '9') {
1426         *isValue = true;
1427         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1428         PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1429     } else {
1430         return;
1431     }
1432     Advance();
1433     ParseUnicodePropertyValueCharacters(isValue);
1434 }
1435 
1436 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1437 void RegExpParser::PrintF(const char *fmt, ...)
1438 {
1439 #ifndef _NO_DEBUG_
1440     va_list args;
1441     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,)
1442     va_start(args, fmt);
1443     vprintf(fmt, args);
1444     va_end(args);
1445 #else
1446     (void)fmt;
1447 #endif
1448 }
1449 
ParseError(const char * errorMessage)1450 void RegExpParser::ParseError(const char *errorMessage)
1451 {
1452     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1453     PrintF("error: ");
1454     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1455     PrintF(errorMessage);
1456     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1457     PrintF("\n");
1458     SetIsError();
1459     size_t length = strlen(errorMessage) + 1;
1460     if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1461         LOG_FULL(FATAL) << "memcpy_s failed";
1462         UNREACHABLE();
1463     }
1464 }
1465 
IsIdentFirst(uint32_t c)1466 int RegExpParser::IsIdentFirst(uint32_t c)
1467 {
1468     if (c < CACHE_SIZE) {
1469         return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1; // 5: Shift five bits 31: and operation binary of 31
1470     } else {
1471         return static_cast<int>(u_isIDStart(c));
1472     }
1473 }
1474 
Canonicalize(int c,bool isUnicode)1475 int RegExpParser::Canonicalize(int c, bool isUnicode)
1476 {
1477     if (c < TMP_BUF_SIZE) {  // NOLINTNEXTLINE(readability-magic-numbers)
1478         if (c >= 'a' && c <= 'z') {
1479             c = c - 'a' + 'A';
1480         }
1481     } else {
1482         int cur = c;
1483         if (isUnicode) {
1484             c = u_tolower(static_cast<UChar32>(c));
1485             if (c >= 'a' && c <= 'z') {
1486                 c = cur;
1487             }
1488         } else {
1489             c = u_toupper(static_cast<UChar32>(c));
1490             if (c >= 'A' && c <= 'Z') {
1491                 c = cur;
1492             }
1493         }
1494     }
1495     return c;
1496 }
1497 
NeedIntersection(uint32_t c)1498 bool RegExpParser::NeedIntersection(uint32_t c)
1499 {
1500     return (c <= 'z' && c >= 'a') || (c <= 'Z' && c >= 'A');
1501 }
1502 }  // namespace panda::ecmascript
1503