• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "runtime/regexp/ecmascript/regexp_parser.h"
17 #include "runtime/regexp/ecmascript/regexp_opcode.h"
18 #include "runtime/include/coretypes/string-inl.h"
19 
20 #include "libpandabase/utils/utils.h"
21 #include "libpandabase/utils/utf.h"
22 
23 #include "securec.h"
24 #include "unicode/uchar.h"
25 #include "unicode/uniset.h"
26 
27 #define NO_DEBUG
28 namespace {
29 constexpr int INVALID_UNICODE_FROM_UTF8 = -1;
30 
31 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
32 constexpr int UICODE_FROM_UTF8[] = {
33     0x80, 0xc0, 0xdf, 0xe0, 0xef, 0xf0, 0xf7, 0xf8, 0xfb, 0xfc, 0xfd,
34 };
35 
36 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
37 constexpr int UTF8_MIN_CODE[] = {
38     0x80, 0x800, 0x10000, 0x00200000, 0x04000000,
39 };
40 
41 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
42 constexpr char UTF8_FIRST_CODE[] = {
43     0x1f, 0xf, 0x7, 0x3, 0x1,
44 };
45 
FromUtf8(int c,int l,const uint8_t * p,const uint8_t ** pp)46 int FromUtf8(int c, int l, const uint8_t *p, const uint8_t **pp)
47 {
48     uint32_t b;
49     // NOLINTNEXTLINE(hicpp-signed-bitwise)
50     c &= UTF8_FIRST_CODE[l - 1];
51     for (int i = 0; i < l; i++) {
52         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
53         b = *p++;
54         if (b < panda::utf::UTF8_2B_SECOND || b >= panda::utf::UTF8_2B_FIRST) {
55             return INVALID_UNICODE_FROM_UTF8;
56         }
57         // NOLINTNEXTLINE(hicpp-signed-bitwise)
58         c = (c << 6) | (b & panda::utf::UTF8_2B_THIRD);  // 6: Maximum Unicode range
59     }
60     if (c < UTF8_MIN_CODE[l - 1]) {
61         return INVALID_UNICODE_FROM_UTF8;
62     }
63     *pp = p;
64     return c;
65 }
66 
UnicodeFromUtf8(const uint8_t * p,int maxLen,const uint8_t ** pp)67 int UnicodeFromUtf8(const uint8_t *p, int maxLen, const uint8_t **pp)
68 {
69     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
70     int c = *p++;
71     if (c < UICODE_FROM_UTF8[0]) {
72         *pp = p;
73         return c;
74     }
75     int l = 0;
76     if (c >= UICODE_FROM_UTF8[1U] && c <= UICODE_FROM_UTF8[2U]) {         // 1 - 2: 0000 0080 - 0000 07FF
77         l = 1;                                                            // 1: 0000 0080 - 0000 07FF Unicode
78     } else if (c >= UICODE_FROM_UTF8[3U] && c <= UICODE_FROM_UTF8[4U]) {  // 3 - 4: 0000 0800 - 0000 FFFF
79         l = 2;                                                            // 2: 0000 0800 - 0000 FFFF Unicode
80     } else if (c >= UICODE_FROM_UTF8[5U] && c <= UICODE_FROM_UTF8[6U]) {  // 5 - 6: 0001 0000 - 0010 FFFF
81         l = 3;                                                            // 3: 0001 0000 - 0010 FFFF Unicode
82     } else if (c >= UICODE_FROM_UTF8[7U] && c <= UICODE_FROM_UTF8[8U]) {  // 7 - 8: 0020 0000 - 03FF FFFF
83         l = 4;                                                            // 4: 0020 0000 - 03FF FFFF Unicode
84         // NOLINTNEXTLINE(readability-magic-numbers)
85     } else if (c == UICODE_FROM_UTF8[9U] || c == UICODE_FROM_UTF8[10U]) {  // 9 - 10: 0400 0000 - 7FFF FFFF
86         l = 5;                                                             // 5: 0400 0000 - 7FFF FFFF Unicode
87     } else {
88         return INVALID_UNICODE_FROM_UTF8;
89     }
90     /* check that we have enough characters */
91     if (l > (maxLen - 1)) {
92         return INVALID_UNICODE_FROM_UTF8;
93     }
94     return FromUtf8(c, l, p, pp);
95 }
96 }  // namespace
97 
98 namespace panda {
99 static constexpr uint32_t CACHE_SIZE = 128;
100 static constexpr uint32_t CHAR_MAXS = 128;
101 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
102 static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
103     /* $ A-Z _ a-z */
104     0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE};
105 static RangeSet g_gRangeD(0x30, 0x39);  // NOLINT(fuchsia-statically-constructed-objects, readability-magic-numbers)
106 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
107 static RangeSet g_gRangeS({
108     std::pair<uint32_t, uint32_t>(0x0009, 0x000D),  // NOLINT(readability-magic-numbers)
109     std::pair<uint32_t, uint32_t>(0x0020, 0x0020),  // NOLINT(readability-magic-numbers)
110     std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0),  // NOLINT(readability-magic-numbers)
111     std::pair<uint32_t, uint32_t>(0x1680, 0x1680),  // NOLINT(readability-magic-numbers)
112     std::pair<uint32_t, uint32_t>(0x2000, 0x200A),  // NOLINT(readability-magic-numbers)
113     /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
114     /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
115     std::pair<uint32_t, uint32_t>(0x2028, 0x2029),  // NOLINT(readability-magic-numbers)
116     std::pair<uint32_t, uint32_t>(0x202F, 0x202F),  // NOLINT(readability-magic-numbers)
117     std::pair<uint32_t, uint32_t>(0x205F, 0x205F),  // NOLINT(readability-magic-numbers)
118     std::pair<uint32_t, uint32_t>(0x3000, 0x3000),  // NOLINT(readability-magic-numbers)
119     /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
120     std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF),  // NOLINT(readability-magic-numbers)
121 });
122 
123 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
124 static RangeSet g_gRangeW({
125     std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINT(readability-magic-numbers)
126     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
127     std::pair<uint32_t, uint32_t>(0x005F, 0x005F),  // NOLINT(readability-magic-numbers)
128     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
129 });
130 
131 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
132 static RangeSet g_gRegexpIdentifyStart({
133     std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINT(readability-magic-numbers)
134     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
135     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
136 });
137 
138 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
139 static RangeSet g_gRegexpIdentifyContinue({
140     std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINT(readability-magic-numbers)
141     std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINT(readability-magic-numbers)
142     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
143     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
144 });
145 
Parse()146 void RegExpParser::Parse()
147 {
148     // dynbuffer head init [size,capture_count,statck_count,flags]
149     buffer_.EmitU32(0);
150     buffer_.EmitU32(0);
151     buffer_.EmitU32(0);
152     buffer_.EmitU32(0);
153     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
154     PrintF("Parse Pattern------\n");
155     // Pattern[U, N]::
156     //      Disjunction[?U, ?N]
157     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
158     Advance();
159     SaveStartOpCode saveStartOp;
160     int captureIndex = captureCount_++;
161     saveStartOp.EmitOpCode(&buffer_, captureIndex);
162     ParseDisjunction(false);
163     if (c0_ != KEY_EOF) {
164         ParseError("extraneous characters at the end");
165         return;
166     }
167     SaveEndOpCode saveEndOp;
168     saveEndOp.EmitOpCode(&buffer_, captureIndex);
169     MatchEndOpCode matchEndOp;
170     matchEndOp.EmitOpCode(&buffer_, 0);
171     // dynbuffer head assignments
172     buffer_.PutU32(0, buffer_.size_);
173     buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
174     buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
175     buffer_.PutU32(FLAGS_OFFSET, flags_);
176 #ifndef NO_DEBUG
177     RegExpOpCode::DumpRegExpOpCode(std::cout, buffer_);
178 #endif
179 }
180 
ParseDisjunction(bool isBackward)181 void RegExpParser::ParseDisjunction(bool isBackward)
182 {
183     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
184     PrintF("Parse Disjunction------\n");
185     size_t start = buffer_.size_;
186     ParseAlternative(isBackward);
187     if (isError_) {
188         return;
189     }
190     do {
191         if (c0_ == '|') {
192             SplitNextOpCode splitOp;
193             uint32_t len = buffer_.size_ - start;
194             GotoOpCode gotoOp;
195             splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
196             uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
197             Advance();
198             ParseAlternative(isBackward);
199             gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
200         }
201     } while (c0_ != KEY_EOF && c0_ != ')');
202 }
203 
ParseOctalLiteral()204 uint32_t RegExpParser::ParseOctalLiteral()
205 {
206     // For compatibility with some other browsers (not all), we parse
207     // up to three octal digits with a value below 256.
208     // ES#prod-annexB-LegacyOctalEscapeSequence
209     uint32_t value = c0_ - '0';
210     Advance();
211     if (c0_ >= '0' && c0_ <= '7') {
212         value = value * OCTAL_VALUE + c0_ - '0';
213         Advance();
214         if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
215             value = value * OCTAL_VALUE + c0_ - '0';
216             Advance();
217         }
218     }
219     return value;
220 }
221 
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)222 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
223 {
224     uint32_t x = 0;
225     int d = static_cast<int>(HexValue(c0_));
226     if (d < 0) {
227         return false;
228     }
229     while (d >= 0) {
230         if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
231             LOG(FATAL, COMMON) << "value overflow";
232             return false;
233         }
234         x = x * HEX_VALUE + static_cast<uint32_t>(d);
235         if (x > maxValue) {
236             return false;
237         }
238         Advance();
239         d = static_cast<int>(HexValue(c0_));
240     }
241     *value = x;
242     return true;
243 }
244 
245 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uint32_t * value)246 bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
247 {
248     // Accept both \uxxxx and \u{xxxxxx} (if allowed).
249     // In the latter case, the number of hex digits between { } is arbitrary.
250     // \ and u have already been read.
251     if (c0_ == '{' && IsUtf16()) {
252         uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
253         Advance();
254         if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {  // NOLINT(readability-magic-numbers)
255             if (c0_ == '}') {
256                 Advance();
257                 return true;
258             }
259         }
260         pc_ = start;
261         Advance();
262         return false;
263     }
264     // \u but no {, or \u{...} escapes not allowed.
265     bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
266     if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
267         // Attempt to read trail surrogate.
268         uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
269         if (*pc_ == 'u') {
270             Advance(UNICODE_HEX_ADVANCE);
271             uint32_t trail;
272             if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
273                 *value = U16_GET_SUPPLEMENTARY((*value), (trail));  // NOLINT(hicpp-signed-bitwise)
274                 return true;
275             }
276         }
277         pc_ = start;
278         Advance();
279     }
280     return result;
281 }
282 
ParseHexEscape(int length,uint32_t * value)283 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
284 {
285     uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
286     uint32_t val = 0;
287     for (int i = 0; i < length; ++i) {
288         uint32_t c = c0_;
289         int d = static_cast<int>(HexValue(c));
290         if (d < 0) {
291             pc_ = start;
292             Advance();
293             return false;
294         }
295         val = val * HEX_VALUE + static_cast<uint32_t>(d);
296         Advance();
297     }
298     *value = val;
299     return true;
300 }
301 
302 // NOLINTNEXTLINE(readability-function-size)
ParseAlternative(bool isBackward)303 void RegExpParser::ParseAlternative(bool isBackward)
304 {
305     size_t start = buffer_.size_;
306     while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
307         if (isError_) {
308             return;
309         }
310         size_t atomBcStart = buffer_.GetSize();
311         int captureIndex = 0;
312         bool isAtom = false;
313         switch (c0_) {
314             case '^': {
315                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
316                 PrintF("Assertion %c line start \n", c0_);
317                 LineStartOpCode lineStartOp;
318                 lineStartOp.EmitOpCode(&buffer_, 0);
319                 Advance();
320                 break;
321             }
322             case '$': {
323                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
324                 PrintF("Assertion %c line end \n", c0_);
325                 LineEndOpCode lineEndOp;
326                 lineEndOp.EmitOpCode(&buffer_, 0);
327                 Advance();
328                 break;
329             }
330             case '\\': {
331                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
332                 PrintF("Escape %c \n", c0_);
333                 Advance();
334                 switch (c0_) {
335                     case 'b': {
336                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
337                         PrintF("Assertion %c \n", c0_);
338                         WordBoundaryOpCode wordBoundaryOp;
339                         wordBoundaryOp.EmitOpCode(&buffer_, 0);
340                         Advance();
341                         break;
342                     }
343                     case 'B': {
344                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
345                         PrintF("Assertion %c \n", c0_);
346                         NotWordBoundaryOpCode notWordBoundaryOp;
347                         notWordBoundaryOp.EmitOpCode(&buffer_, 0);
348                         Advance();
349                         break;
350                     }
351                     default: {
352                         isAtom = true;
353                         int atomValue = ParseAtomEscape(isBackward);
354                         if (atomValue != -1) {
355                             if (IsIgnoreCase()) {
356                                 if (!IsUtf16()) {
357                                     atomValue = Canonicalize(atomValue, false);
358                                 } else {
359                                     icu::UnicodeSet set(atomValue, atomValue);
360                                     set.closeOver(USET_CASE_INSENSITIVE);
361                                     set.removeAllStrings();
362                                     int32_t size = set.size();
363                                     RangeOpCode rangeOp;
364                                     RangeSet rangeResult;
365                                     for (int32_t idx = 0; idx < size; idx++) {
366                                         int32_t uc = set.charAt(idx);
367                                         RangeSet curRange(uc);
368                                         rangeResult.Insert(curRange);
369                                     }
370                                     rangeOp.InsertOpCode(&buffer_, rangeResult);
371                                     break;
372                                 }
373                             }
374                             if (atomValue <= UINT16_MAX) {
375                                 CharOpCode charOp;
376                                 charOp.EmitOpCode(&buffer_, atomValue);
377                             } else {
378                                 Char32OpCode charOp;
379                                 charOp.EmitOpCode(&buffer_, atomValue);
380                             }
381                         }
382                         break;
383                     }
384                 }
385                 break;
386             }
387             case '(': {
388                 Advance();
389                 isAtom = ParseAssertionCapture(&captureIndex, isBackward);
390                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
391                 Advance();
392                 break;
393             }
394             case '.': {
395                 PrevOpCode prevOp;
396                 if (isBackward) {
397                     prevOp.EmitOpCode(&buffer_, 0);
398                 }
399                 if (IsDotAll()) {
400                     AllOpCode allOp;
401                     allOp.EmitOpCode(&buffer_, 0);
402                 } else {
403                     DotsOpCode dotsOp;
404                     dotsOp.EmitOpCode(&buffer_, 0);
405                 }
406                 if (isBackward) {
407                     prevOp.EmitOpCode(&buffer_, 0);
408                 }
409                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
410                 PrintF("Atom %c match any \n", c0_);
411                 isAtom = true;
412                 Advance();
413                 break;
414             }
415             case '[': {
416                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
417                 PrintF("Atom %c match range \n", c0_);
418                 isAtom = true;
419                 PrevOpCode prevOp;
420                 Advance();
421                 if (isBackward) {
422                     prevOp.EmitOpCode(&buffer_, 0);
423                 }
424                 bool isInvert = false;
425                 if (c0_ == '^') {
426                     isInvert = true;
427                     Advance();
428                 }
429                 RangeSet rangeResult;
430                 if (!ParseClassRanges(&rangeResult)) {
431                     break;
432                 }
433                 if (isInvert) {
434                     rangeResult.Invert(IsUtf16());
435                 }
436                 uint32_t highValue = rangeResult.HighestValue();
437                 if (highValue <= UINT16_MAX) {
438                     RangeOpCode rangeOp;
439                     rangeOp.InsertOpCode(&buffer_, rangeResult);
440                 } else {
441                     Range32OpCode rangeOp;
442                     rangeOp.InsertOpCode(&buffer_, rangeResult);
443                 }
444 
445                 if (isBackward) {
446                     prevOp.EmitOpCode(&buffer_, 0);
447                 }
448                 break;
449             }
450             case '*':
451             case '+':
452             case '?':
453                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
454                 ParseError("nothing to repeat");
455                 return;
456             case '{': {
457                 uint8_t *begin = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
458                 int dummy;
459                 if (ParserIntervalQuantifier(&dummy, &dummy)) {
460                     ParseError("nothing to repeat");
461                     return;
462                 }
463                 pc_ = begin;
464                 Advance();
465             }
466                 [[fallthrough]];
467             case '}':
468             case ']':
469                 if (IsUtf16()) {
470                     ParseError("syntax error");
471                     return;
472                 }
473                 [[fallthrough]];
474             default: {
475                 // PatternCharacter
476                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
477                 PrintF("PatternCharacter %c\n", c0_);
478                 isAtom = true;
479                 {
480                     PrevOpCode prevOp;
481                     if (isBackward) {
482                         prevOp.EmitOpCode(&buffer_, 0);
483                     }
484                     uint32_t matchedChar = c0_;
485                     if (c0_ > (INT8_MAX + 1)) {
486                         Prev();
487                         int i = 0;
488                         UChar32 c;
489                         int32_t length = end_ - pc_ + 1;
490                         // NOLINTNEXTLINE(hicpp-signed-bitwise)
491                         U8_NEXT(pc_, i, length, c);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
492                         matchedChar = static_cast<uint32_t>(c);
493                         pc_ += i;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
494                     }
495                     if (IsIgnoreCase()) {
496                         matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
497                     }
498                     if (matchedChar > UINT16_MAX) {
499                         Char32OpCode charOp;
500                         charOp.EmitOpCode(&buffer_, matchedChar);
501                     } else {
502                         CharOpCode charOp;
503                         charOp.EmitOpCode(&buffer_, matchedChar);
504                     }
505                     if (isBackward) {
506                         prevOp.EmitOpCode(&buffer_, 0);
507                     }
508                 }
509                 Advance();
510                 break;
511             }
512         }
513         if (isAtom && !isError_) {
514             ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
515         }
516         if (isBackward) {
517             size_t end = buffer_.GetSize();
518             size_t termSize = end - atomBcStart;
519             size_t moveSize = end - start;
520             buffer_.Expand(end + termSize);
521             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
522             if (memmove_s(buffer_.buf_ + start + termSize, moveSize, buffer_.buf_ + start, moveSize) != EOK) {
523                 LOG(FATAL, COMMON) << "memmove_s failed";
524                 UNREACHABLE();
525             }
526             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
527             if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
528                 LOG(FATAL, COMMON) << "memcpy_s failed";
529                 UNREACHABLE();
530             }
531         }
532     }
533 }
534 
FindGroupName(const PandaString & name)535 int RegExpParser::FindGroupName(const PandaString &name)
536 {
537     size_t len = 0;
538     size_t nameLen = name.size();
539     const char *p = reinterpret_cast<char *>(groupNames_.buf_);
540     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
541     const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
542     int captureIndex = 1;
543     while (p < bufEnd) {
544         len = strlen(p);
545         if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
546             return captureIndex;
547         }
548         p += len + 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
549         captureIndex++;
550     }
551     return -1;
552 }
553 
ParseAssertionCapture(int * captureIndex,bool isBackward)554 bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
555 {
556     bool isAtom = false;
557     do {
558         if (c0_ == '?') {
559             Advance();
560             switch (c0_) {
561                 // (?=Disjunction[?U, ?N])
562                 case '=': {
563                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
564                     PrintF("Assertion(?= Disjunction)\n");
565                     Advance();
566                     uint32_t start = buffer_.size_;
567                     ParseDisjunction(isBackward);
568                     MatchOpCode matchOp;
569                     matchOp.EmitOpCode(&buffer_, 0);
570                     MatchAheadOpCode matchAheadOp;
571                     uint32_t len = buffer_.size_ - start;
572                     matchAheadOp.InsertOpCode(&buffer_, start, len);
573                     break;
574                 }
575                 // (?!Disjunction[?U, ?N])
576                 case '!': {
577                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
578                     PrintF("Assertion(?! Disjunction)\n");
579                     uint32_t start = buffer_.size_;
580                     Advance();
581                     ParseDisjunction(isBackward);
582                     MatchOpCode matchOp;
583                     matchOp.EmitOpCode(&buffer_, 0);
584                     NegativeMatchAheadOpCode matchAheadOp;
585                     uint32_t len = buffer_.size_ - start;
586                     matchAheadOp.InsertOpCode(&buffer_, start, len);
587                     break;
588                 }
589                 case '<': {
590                     Advance();
591                     // (?<=Disjunction[?U, ?N])
592                     if (c0_ == '=') {
593                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
594                         PrintF("Assertion(?<= Disjunction)\n");
595                         Advance();
596                         uint32_t start = buffer_.size_;
597                         ParseDisjunction(true);
598                         MatchOpCode matchOp;
599                         matchOp.EmitOpCode(&buffer_, 0);
600                         MatchAheadOpCode matchAheadOp;
601                         uint32_t len = buffer_.size_ - start;
602                         matchAheadOp.InsertOpCode(&buffer_, start, len);
603                         // (?<!Disjunction[?U, ?N])
604                     } else if (c0_ == '!') {
605                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
606                         PrintF("Assertion(?<! Disjunction)\n");
607                         Advance();
608                         uint32_t start = buffer_.size_;
609                         ParseDisjunction(true);
610                         MatchOpCode matchOp;
611                         matchOp.EmitOpCode(&buffer_, 0);
612                         NegativeMatchAheadOpCode matchAheadOp;
613                         uint32_t len = buffer_.size_ - start;
614                         matchAheadOp.InsertOpCode(&buffer_, start, len);
615                     } else {
616                         Prev();
617                         PandaString name;
618                         auto **pp = const_cast<const uint8_t **>(&pc_);
619                         if (!ParseGroupSpecifier(pp, name)) {
620                             ParseError("GroupName Syntax error.");
621                             return false;
622                         }
623                         if (FindGroupName(name) > 0) {
624                             ParseError("Duplicate GroupName error.");
625                             return false;
626                         }
627                         groupNames_.EmitStr(name.c_str());
628                         newGroupNames_.push_back(name);
629                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
630                         PrintF("group name %s", name.c_str());
631                         Advance();
632                         goto parseCapture;  // NOLINT(cppcoreguidelines-avoid-goto)
633                     }
634                     break;
635                 }
636                 // (?:Disjunction[?U, ?N])
637                 case ':':
638                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
639                     PrintF("Atom(?<: Disjunction)\n");
640                     isAtom = true;
641                     Advance();
642                     ParseDisjunction(isBackward);
643                     break;
644                 default:
645                     Advance();
646                     ParseError("? Syntax error.");
647                     return false;
648             }
649         } else {
650             groupNames_.EmitChar(0);
651         parseCapture:
652             isAtom = true;
653             *captureIndex = captureCount_++;
654             SaveEndOpCode saveEndOp;
655             SaveStartOpCode saveStartOp;
656             if (isBackward) {
657                 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
658             } else {
659                 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
660             }
661             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
662             PrintF("capture start %d \n", *captureIndex);
663             ParseDisjunction(isBackward);
664             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
665             PrintF("capture end %d \n", *captureIndex);
666             if (isBackward) {
667                 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
668             } else {
669                 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
670             }
671         }
672     } while (c0_ != ')' && c0_ != KEY_EOF);
673     if (c0_ != ')') {
674         ParseError("capture syntax error");
675         return false;
676     }
677     return isAtom;
678 }
679 
ParseDecimalDigits()680 int RegExpParser::ParseDecimalDigits()
681 {
682     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
683     PrintF("Parse DecimalDigits------\n");
684     uint32_t result = 0;
685     bool overflow = false;
686     while (true) {
687         if (c0_ < '0' || c0_ > '9') {
688             break;
689         }
690         if (!overflow) {
691             if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
692                 overflow = true;
693             } else {
694                 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
695             }
696         }
697         Advance();
698     }
699     if (overflow) {
700         return INT32_MAX;
701     }
702     return result;
703 }
704 
ParserIntervalQuantifier(int * pmin,int * pmax)705 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
706 {
707     // Quantifier::
708     //     QuantifierPrefix
709     //     QuantifierPrefix?
710     // QuantifierPrefix::
711     // *
712     // +
713     // ?
714     // {DecimalDigits}
715     // {DecimalDigits,}
716     // {DecimalDigits,DecimalDigits}
717     Advance();
718     *pmin = ParseDecimalDigits();
719     *pmax = *pmin;
720     switch (c0_) {
721         case ',': {
722             Advance();
723             if (c0_ == '}') {
724                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
725                 PrintF("QuantifierPrefix{DecimalDigits,}\n");
726                 *pmax = INT32_MAX;
727                 Advance();
728             } else {
729                 *pmax = ParseDecimalDigits();
730                 if (c0_ == '}') {
731                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
732                     PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
733                     Advance();
734                 } else {
735                     return false;
736                 }
737             }
738             break;
739         }
740         case '}':
741             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
742             PrintF("QuantifierPrefix{DecimalDigits}\n");
743             Advance();
744             break;
745         default:
746             Advance();
747             return false;
748     }
749     return true;
750 }
751 
ParseQuantifier(size_t atomBcStart,int captureStart,int captureEnd)752 void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
753 {
754     int min = -1;
755     int max = -1;
756     bool isGreedy = true;
757     switch (c0_) {
758         case '*':
759             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
760             PrintF("QuantifierPrefix %c\n", c0_);
761             min = 0;
762             max = INT32_MAX;
763             Advance();
764             break;
765         case '+':
766             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
767             PrintF("QuantifierPrefix %c\n", c0_);
768             min = 1;
769             max = INT32_MAX;
770             Advance();
771             break;
772         case '?':
773             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
774             PrintF("QuantifierPrefix %c\n", c0_);
775             Advance();
776             min = 0;
777             max = 1;
778             break;
779         case '{': {
780             uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
781             if (!ParserIntervalQuantifier(&min, &max)) {
782                 pc_ = start;
783                 Advance();  // back to '{'
784                 return;
785             }
786             if (min > max) {
787                 ParseError("Invalid repetition count");
788                 return;
789             }
790             break;
791         }
792         default:
793             break;
794     }
795     if (c0_ == '?') {
796         isGreedy = false;
797         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
798         PrintF("Quantifier::QuantifierPrefix?\n");
799         Advance();
800     } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
801         ParseError("nothing to repeat");
802         return;
803     }
804     if (min != -1 && max != -1) {
805         stackCount_++;
806         PushOpCode pushOp;
807         pushOp.InsertOpCode(&buffer_, atomBcStart);
808         atomBcStart += pushOp.GetSize();
809 
810         if (captureStart != 0) {
811             SaveResetOpCode saveResetOp;
812             saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
813         }
814 
815         // zero advance check
816         if (max == INT32_MAX) {
817             stackCount_++;
818             PushCharOpCode pushCharOp;
819             pushCharOp.InsertOpCode(&buffer_, atomBcStart);
820             CheckCharOpCode checkCharOp;
821             // NOLINTNEXTLINE(readability-magic-numbers)
822             checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
823         }
824 
825         if (isGreedy) {
826             LoopGreedyOpCode loopOp;
827             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
828         } else {
829             LoopOpCode loopOp;
830             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
831         }
832 
833         if (min == 0) {
834             if (isGreedy) {
835                 SplitNextOpCode splitNextOp;
836                 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
837             } else {
838                 SplitFirstOpCode splitFirstOp;
839                 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
840             }
841         }
842 
843         PopOpCode popOp;
844         popOp.EmitOpCode(&buffer_);
845     }
846 }
847 
ParseGroupSpecifier(const uint8_t ** pp,PandaString & name)848 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, PandaString &name)
849 {
850     const uint8_t *p = *pp;
851     uint32_t c;
852     std::array<char, CACHE_SIZE> buffer {};
853     char *q = buffer.data();
854     while (true) {
855         if (p <= end_) {
856             c = *p;
857         } else {
858             c = KEY_EOF;
859         }
860         if (c == '\\') {
861             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
862             p++;
863             if (*p != 'u') {
864                 return false;
865             }
866             if (!ParseUnicodeEscape(&c)) {
867                 return false;
868             }
869         } else if (c == '>') {
870             break;
871         } else if (c > CACHE_SIZE && c != KEY_EOF) {
872             c = static_cast<uint32_t>(UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
873         } else if (c != KEY_EOF) {
874             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
875             p++;
876         } else {
877             return false;
878         }
879         if (q == buffer.data()) {
880             if (IsIdentFirst(c) != 0) {
881                 return false;
882             }
883         } else {
884             if (!u_isIDPart(c)) {
885                 return false;
886             }
887         }
888         if (q != nullptr) {
889             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
890             *q++ = c;
891         }
892     }
893     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
894     p++;
895     *pp = p;
896     name = buffer.data();
897     return true;
898 }
899 
ParseCaptureCount(const char * groupName)900 int RegExpParser::ParseCaptureCount(const char *groupName)
901 {
902     const uint8_t *p = nullptr;
903     int captureIndex = 1;
904     PandaString name;
905     hasNamedCaptures_ = 0;
906     for (p = base_; p < end_; p++) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
907         switch (*p) {
908             case '(': {
909                 if (p[1] == '?') {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
910                     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
911                     if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
912                         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
913                         p[CAPTURE_CONUT_ADVANCE] != '=') {
914                         hasNamedCaptures_ = 1;
915                         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
916                         p += CAPTURE_CONUT_ADVANCE;
917                         if (groupName != nullptr) {
918                             if (ParseGroupSpecifier(&p, name)) {
919                                 if (strcmp(name.c_str(), groupName) == 0) {
920                                     return captureIndex;
921                                 }
922                             }
923                         }
924                         captureIndex++;
925                     }
926                 } else {
927                     captureIndex++;
928                 }
929                 break;
930             }
931             case '\\':
932                 p++;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
933                 break;
934             case '[': {
935                 while (p < end_ && *p != ']') {
936                     if (*p == '\\') {
937                         p++;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
938                     }
939                     p++;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
940                 }
941                 break;
942             }
943             default:
944                 break;
945         }
946     }
947     return captureIndex;
948 }
949 
950 // NOLINTNEXTLINE(readability-function-size)
ParseAtomEscape(bool isBackward)951 int RegExpParser::ParseAtomEscape(bool isBackward)
952 {
953     // AtomEscape[U, N]::
954     //     DecimalEscape
955     //     CharacterClassEscape[?U]
956     //     CharacterEscape[?U]
957     //     [+N]kGroupName[?U]
958     int result = -1;
959     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
960     PrintF("Parse AtomEscape------\n");
961     PrevOpCode prevOp;
962     switch (c0_) {
963         case KEY_EOF:
964             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
965             ParseError("unexpected end");
966             break;
967         // DecimalEscape
968         case '1':
969         case '2':
970         case '3':
971         case '4':
972         case '5':
973         case '6':
974         case '7':
975         case '8':
976         case '9': {
977             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
978             PrintF("NonZeroDigit %c\n", c0_);
979             int capture = ParseDecimalDigits();
980             if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
981                 ParseError("invalid backreference count");
982                 break;
983             }
984             if (isBackward) {
985                 BackwardBackReferenceOpCode backReferenceOp;
986                 backReferenceOp.EmitOpCode(&buffer_, capture);
987             } else {
988                 BackReferenceOpCode backReferenceOp;
989                 backReferenceOp.EmitOpCode(&buffer_, capture);
990             }
991             break;
992         }
993         // CharacterClassEscape
994         case 'd': {
995             // [0-9]
996             RangeOpCode rangeOp;
997             if (isBackward) {
998                 prevOp.EmitOpCode(&buffer_, 0);
999             }
1000             rangeOp.InsertOpCode(&buffer_, g_gRangeD);
1001             goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
1002             break;
1003         }
1004         case 'D': {
1005             // [^0-9]
1006             RangeSet atomRange(g_gRangeD);
1007             atomRange.Invert(IsUtf16());
1008             Range32OpCode rangeOp;
1009             if (isBackward) {
1010                 prevOp.EmitOpCode(&buffer_, 0);
1011             }
1012             rangeOp.InsertOpCode(&buffer_, atomRange);
1013             goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
1014             break;
1015         }
1016         case 's': {
1017             // [\f\n\r\t\v]
1018             RangeOpCode rangeOp;
1019             if (isBackward) {
1020                 prevOp.EmitOpCode(&buffer_, 0);
1021             }
1022             rangeOp.InsertOpCode(&buffer_, g_gRangeS);
1023             goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
1024             break;
1025         }
1026         case 'S': {
1027             RangeSet atomRange(g_gRangeS);
1028             Range32OpCode rangeOp;
1029             atomRange.Invert(IsUtf16());
1030             if (isBackward) {
1031                 prevOp.EmitOpCode(&buffer_, 0);
1032             }
1033             rangeOp.InsertOpCode(&buffer_, atomRange);
1034             goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
1035             break;
1036         }
1037         case 'w': {
1038             // [A-Za-z0-9]
1039             RangeOpCode rangeOp;
1040             if (isBackward) {
1041                 prevOp.EmitOpCode(&buffer_, 0);
1042             }
1043             rangeOp.InsertOpCode(&buffer_, g_gRangeW);
1044             goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
1045             break;
1046         }
1047         case 'W': {
1048             // [^A-Za-z0-9]
1049             RangeSet atomRange(g_gRangeW);
1050             atomRange.Invert(IsUtf16());
1051             Range32OpCode rangeOp;
1052             if (isBackward) {
1053                 prevOp.EmitOpCode(&buffer_, 0);
1054             }
1055             rangeOp.InsertOpCode(&buffer_, atomRange);
1056             goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
1057             break;
1058         }
1059         // P{UnicodePropertyValueExpression}
1060         // p{UnicodePropertyValueExpression}
1061         case 'P':
1062         case 'p':
1063         // [+N]kGroupName[?U]
1064         case 'k': {
1065             Advance();
1066             if (c0_ != '<') {
1067                 if (!IsUtf16() || HasNamedCaptures()) {
1068                     ParseError("expecting group name.");
1069                     break;
1070                 }
1071             }
1072             Advance();
1073             Prev();
1074             PandaString name;
1075             auto **pp = const_cast<const uint8_t **>(&pc_);
1076             if (!ParseGroupSpecifier(pp, name)) {
1077                 ParseError("GroupName Syntax error.");
1078                 break;
1079             }
1080             int postion = FindGroupName(name);
1081             if (postion < 0) {
1082                 postion = ParseCaptureCount(name.c_str());
1083                 if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
1084                     ParseError("group name not defined");
1085                     break;
1086                 }
1087             }
1088             if (isBackward) {
1089                 BackwardBackReferenceOpCode backReferenceOp;
1090                 backReferenceOp.EmitOpCode(&buffer_, postion);
1091             } else {
1092                 BackReferenceOpCode backReferenceOp;
1093                 backReferenceOp.EmitOpCode(&buffer_, postion);
1094             }
1095             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1096             Advance();
1097             break;
1098         }
1099         parseLookBehind : {
1100             if (isBackward) {
1101                 prevOp.EmitOpCode(&buffer_, 0);
1102             }
1103             Advance();
1104             break;
1105         }
1106         default:
1107             result = ParseCharacterEscape();
1108             break;
1109     }
1110     return result;
1111 }
1112 
RecountCaptures()1113 int RegExpParser::RecountCaptures()
1114 {
1115     if (totalCaptureCount_ < 0) {
1116         const char *name = reinterpret_cast<const char *>(groupNames_.buf_);
1117         totalCaptureCount_ = ParseCaptureCount(name);
1118     }
1119     return totalCaptureCount_;
1120 }
HasNamedCaptures()1121 bool RegExpParser::HasNamedCaptures()
1122 {
1123     if (hasNamedCaptures_ < 0) {
1124         RecountCaptures();
1125     }
1126     return false;
1127 }
1128 
ParseCharacterEscape()1129 int RegExpParser::ParseCharacterEscape()
1130 {
1131     // CharacterEscape[U]::
1132     //     ControlEscape
1133     //     c ControlLetter
1134     //     0 [lookahead ∉ DecimalDigit]
1135     //     HexEscapeSequence
1136     //     RegExpUnicodeEscapeSequence[?U]
1137     //     IdentityEscape[?U]
1138     uint32_t result = 0;
1139     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1140     switch (c0_) {
1141         // ControlEscape
1142         case 'f':
1143             result = '\f';
1144             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1145             PrintF("ControlEscape %c\n", c0_);
1146             Advance();
1147             break;
1148         case 'n':
1149             result = '\n';
1150             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1151             PrintF("ControlEscape %c\n", c0_);
1152             Advance();
1153             break;
1154         case 'r':
1155             result = '\r';
1156             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1157             PrintF("ControlEscape %c\n", c0_);
1158             Advance();
1159             break;
1160         case 't':
1161             result = '\t';
1162             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1163             PrintF("ControlEscape %c\n", c0_);
1164             Advance();
1165             break;
1166         case 'v':
1167             result = '\v';
1168             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1169             PrintF("ControlEscape %c\n", c0_);
1170             Advance();
1171             break;
1172         // c ControlLetter
1173         case 'c': {
1174             Advance();
1175             if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1176                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1177                 PrintF("ControlLetter %c\n", c0_);
1178                 result = static_cast<uint32_t>(c0_) & 0x1f;  // NOLINT(readability-magic-numbers, hicpp-signed-bitwise)
1179                 Advance();
1180             } else {
1181                 if (!IsUtf16()) {
1182                     pc_--;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1183                     result = '\\';
1184                 } else {
1185                     ParseError("Invalid control letter");
1186                     return -1;
1187                 }
1188             }
1189             break;
1190         }
1191         case '0': {
1192             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1193             PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
1194             if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) {  // NOLINT(readability-magic-numbers)
1195                 Advance();
1196                 result = 0;
1197                 break;
1198             }
1199             [[fallthrough]];
1200         }
1201         case '1':
1202         case '2':
1203         case '3':
1204         case '4':
1205         case '5':
1206         case '6':
1207         case '7': {
1208             if (IsUtf16()) {
1209                 // With /u, decimal escape is not interpreted as octal character code.
1210                 ParseError("Invalid class escape");
1211                 return 0;
1212             }
1213             result = ParseOctalLiteral();
1214             break;
1215         }
1216         // ParseHexEscapeSequence
1217         // ParseRegExpUnicodeEscapeSequence
1218         case 'x': {
1219             Advance();
1220             if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
1221                 return result;
1222             }
1223             if (IsUtf16()) {
1224                 ParseError("Invalid class escape");
1225                 return -1;
1226             }
1227             result = 'x';
1228             break;
1229         }
1230         case 'u': {
1231             Advance();
1232             if (ParseUnicodeEscape(&result)) {
1233                 return result;
1234             }
1235             if (IsUtf16()) {
1236                 // With /u, invalid escapes are not treated as identity escapes.
1237                 ParseError("Invalid unicode escape");
1238                 return 0;
1239             }
1240             // If \u is not followed by a two-digit hexadecimal, treat it
1241             // as an identity escape.
1242             result = 'u';
1243             break;
1244         }
1245         // IdentityEscape[?U]
1246         case '$':
1247         case '(':
1248         case ')':
1249         case '*':
1250         case '+':
1251         case '.':
1252         case '/':
1253         case '?':
1254         case '[':
1255         case '\\':
1256         case ']':
1257         case '^':
1258         case '{':
1259         case '|':
1260         case '}':
1261             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1262             PrintF("IdentityEscape %c\n", c0_);
1263             result = c0_;
1264             Advance();
1265             break;
1266         default: {
1267             if (IsUtf16()) {
1268                 ParseError("Invalid unicode escape");
1269                 return 0;
1270             }
1271             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1272             PrintF("SourceCharacter %c\n", c0_);
1273             result = c0_;
1274             if (result < CHAR_MAXS) {
1275                 Advance();
1276             }
1277             break;
1278         }
1279     }
1280     return result;
1281 }
1282 
ParseClassRanges(RangeSet * result)1283 bool RegExpParser::ParseClassRanges(RangeSet *result)
1284 {
1285     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1286     PrintF("Parse ClassRanges------\n");
1287     while (c0_ != ']') {
1288         RangeSet s1;
1289         uint32_t c1 = ParseClassAtom(&s1);
1290         if (c1 == UINT32_MAX) {
1291             ParseError("invalid class range");
1292             return false;
1293         }
1294 
1295         int nextC0 = *pc_;
1296         if (c0_ == '-' && nextC0 != ']') {
1297             if (c1 == CLASS_RANGE_BASE) {
1298                 if (IsUtf16()) {
1299                     ParseError("invalid class range");
1300                     return false;
1301                 }
1302                 result->Insert(s1);
1303                 continue;
1304             }
1305             Advance();
1306             RangeSet s2;
1307             uint32_t c2 = ParseClassAtom(&s2);
1308             if (c2 == UINT32_MAX) {
1309                 ParseError("invalid class range");
1310                 return false;
1311             }
1312             if (c2 == CLASS_RANGE_BASE) {
1313                 if (IsUtf16()) {
1314                     ParseError("invalid class range");
1315                     return false;
1316                 }
1317                 result->Insert(s2);
1318                 continue;
1319             }
1320             if (c1 < INT8_MAX) {
1321                 if (c1 > c2) {
1322                     ParseError("invalid class range");
1323                     return false;
1324                 }
1325             }
1326             if (IsIgnoreCase()) {
1327                 c1 = static_cast<uint32_t>(Canonicalize(c1, IsUtf16()));
1328                 c2 = static_cast<uint32_t>(Canonicalize(c2, IsUtf16()));
1329             }
1330 
1331             result->Insert(c1, c2);
1332         } else {
1333             result->Insert(s1);
1334         }
1335     }
1336     Advance();
1337     return true;
1338 }
1339 
ParseClassAtom(RangeSet * atom)1340 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1341 {
1342     uint32_t ret = UINT32_MAX;
1343     switch (c0_) {
1344         case '\\': {
1345             Advance();
1346             ret = static_cast<uint32_t>(ParseClassEscape(atom));
1347             break;
1348         }
1349         case KEY_EOF:
1350             break;
1351         case 0: {
1352             if (pc_ >= end_) {
1353                 return UINT32_MAX;
1354             }
1355             [[fallthrough]];
1356         }
1357         default: {
1358             uint32_t value = c0_;
1359             size_t u16Size = 0;
1360             if (c0_ > INT8_MAX) {
1361                 pc_ -= 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1362                 auto u16Result = utf::ConvertUtf8ToUtf16Pair(pc_, true);
1363                 value = u16Result.first;
1364                 u16Size = u16Result.second;
1365                 Advance(u16Size + 1);
1366             } else {
1367                 Advance();
1368             }
1369             if (IsIgnoreCase()) {
1370                 value = static_cast<uint32_t>(Canonicalize(value, IsUtf16()));
1371             }
1372             atom->Insert(RangeSet(value));
1373             ret = value;
1374             break;
1375         }
1376     }
1377     return ret;
1378 }
1379 
ParseClassEscape(RangeSet * atom)1380 int RegExpParser::ParseClassEscape(RangeSet *atom)
1381 {
1382     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1383     PrintF("Parse ClassEscape------\n");
1384     int result = -1;
1385     switch (c0_) {
1386         case 'b':
1387             Advance();
1388             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1389             PrintF("ClassEscape %c", 'b');
1390             result = '\b';
1391             atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1392             break;
1393         case '-':
1394             Advance();
1395             result = '-';
1396             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1397             PrintF("ClassEscape %c", '-');
1398             atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1399             break;
1400         // CharacterClassEscape
1401         case 'd':
1402         case 'D':
1403             result = CLASS_RANGE_BASE;
1404             atom->Insert(g_gRangeD);
1405             if (c0_ == 'D') {
1406                 atom->Invert(IsUtf16());
1407             }
1408             Advance();
1409             break;
1410         case 's':
1411         case 'S':
1412             result = CLASS_RANGE_BASE;
1413             atom->Insert(g_gRangeS);
1414             if (c0_ == 'S') {
1415                 atom->Invert(IsUtf16());
1416             }
1417             Advance();
1418             break;
1419         case 'w':
1420         case 'W':
1421             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1422             PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1423             result = CLASS_RANGE_BASE;
1424             atom->Insert(g_gRangeW);
1425             if (c0_ == 'W') {
1426                 atom->Invert(IsUtf16());
1427             }
1428             Advance();
1429             break;
1430         // P{UnicodePropertyValueExpression}
1431         // p{UnicodePropertyValueExpression}
1432         case 'P':
1433         case 'p':
1434             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1435             PrintF("Warning: \\p is not supported in ECMA 2015!");
1436             Advance();
1437             if (c0_ == '{') {
1438                 Advance();
1439                 if (c0_ == '}') {
1440                     break;  // p{}, invalid
1441                 }
1442                 bool isValue = false;
1443                 ParseUnicodePropertyValueCharacters(&isValue);
1444                 if (!isValue && c0_ == '=') {
1445                     // UnicodePropertyName = UnicodePropertyValue
1446                     Advance();
1447                     if (c0_ == '}') {
1448                         break;  // p{xxx=}, invalid
1449                     }
1450                     ParseUnicodePropertyValueCharacters(&isValue);
1451                 }
1452                 if (c0_ != '}') {
1453                     break;  // p{xxx, invalid
1454                 }
1455                 // should do atom->Invert() here after ECMA 9.0
1456                 Advance();
1457                 result = CLASS_RANGE_BASE;
1458             }
1459             break;
1460         default:
1461             result = ParseCharacterEscape();
1462             int value = result;
1463             if (IsIgnoreCase()) {
1464                 value = Canonicalize(value, IsUtf16());
1465             }
1466             atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1467             break;
1468     }
1469     return result;
1470 }
1471 
ParseUnicodePropertyValueCharacters(bool * isValue)1472 void RegExpParser::ParseUnicodePropertyValueCharacters(bool *isValue)
1473 {
1474     if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1475         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1476         PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1477     } else if (c0_ == '_') {
1478         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1479         PrintF("UnicodePropertyCharacter:: _ \n");
1480     } else if (c0_ >= '0' && c0_ <= '9') {
1481         *isValue = true;
1482         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1483         PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1484     } else {
1485         return;
1486     }
1487     Advance();
1488     ParseUnicodePropertyValueCharacters(isValue);
1489 }
1490 
1491 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1492 void RegExpParser::PrintF(const char *fmt, ...)
1493 {
1494 #ifndef NO_DEBUG
1495     va_list args;
1496     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,)
1497     va_start(args, fmt);
1498     vprintf(fmt, args);
1499     va_end(args);
1500 #else
1501     (void)fmt;
1502 #endif
1503 }
1504 
ParseError(const char * errorMessage)1505 void RegExpParser::ParseError(const char *errorMessage)
1506 {
1507     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1508     PrintF("error: ");
1509     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1510     PrintF(errorMessage);
1511     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1512     PrintF("\n");
1513     SetIsError();
1514     size_t length = strlen(errorMessage) + 1;
1515     if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1516         LOG(FATAL, COMMON) << "memcpy_s failed";
1517         UNREACHABLE();
1518     }
1519 }
1520 
IsIdentFirst(uint32_t c)1521 int RegExpParser::IsIdentFirst(uint32_t c)
1522 {
1523     if (c < CACHE_SIZE) {
1524         // NOLINTNEXTLINE(hicpp-signed-bitwise
1525         return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1;  // 5: Shift five bits 31: and operation binary of 31
1526     }
1527     return static_cast<int>(u_isIDStart(c));
1528 }
1529 }  // namespace panda