• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "runtime/regexp/ecmascript/regexp_parser.h"
17 #include "runtime/regexp/ecmascript/regexp_opcode.h"
18 #include "runtime/include/coretypes/string-inl.h"
19 
20 #include "libpandabase/utils/utils.h"
21 #include "libpandabase/utils/utf.h"
22 
23 #include "securec.h"
24 #include "unicode/uchar.h"
25 #include "unicode/uniset.h"
26 
27 namespace {
28 constexpr int INVALID_UNICODE_FROM_UTF8 = -1;
29 
30 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
31 constexpr int UICODE_FROM_UTF8[] = {
32     0x80, 0xc0, 0xdf, 0xe0, 0xef, 0xf0, 0xf7, 0xf8, 0xfb, 0xfc, 0xfd,
33 };
34 
35 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
36 constexpr int UTF8_MIN_CODE[] = {
37     0x80, 0x800, 0x10000, 0x00200000, 0x04000000,
38 };
39 
40 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
41 constexpr char UTF8_FIRST_CODE[] = {
42     0x1f, 0xf, 0x7, 0x3, 0x1,
43 };
44 
FromUtf8(int c,int l,const uint8_t * p,const uint8_t ** pp)45 int FromUtf8(int c, int l, const uint8_t *p, const uint8_t **pp)
46 {
47     uint32_t b;
48     // NOLINTNEXTLINE(hicpp-signed-bitwise)
49     c &= UTF8_FIRST_CODE[l - 1];
50     for (int i = 0; i < l; i++) {
51         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
52         b = *p++;
53         if (b < ark::utf::UTF8_2B_SECOND || b >= ark::utf::UTF8_2B_FIRST) {
54             return INVALID_UNICODE_FROM_UTF8;
55         }
56         // NOLINTNEXTLINE(hicpp-signed-bitwise)
57         c = (c << 6) | (b & ark::utf::UTF8_2B_THIRD);  // 6: Maximum Unicode range
58     }
59     if (c < UTF8_MIN_CODE[l - 1]) {
60         return INVALID_UNICODE_FROM_UTF8;
61     }
62     *pp = p;
63     return c;
64 }
65 
UnicodeFromUtf8(const uint8_t * p,int maxLen,const uint8_t ** pp)66 int UnicodeFromUtf8(const uint8_t *p, int maxLen, const uint8_t **pp)
67 {
68     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
69     int c = *p++;
70     if (c < UICODE_FROM_UTF8[0]) {
71         *pp = p;
72         return c;
73     }
74     int l = 0;
75     if (c >= UICODE_FROM_UTF8[1U] && c <= UICODE_FROM_UTF8[2U]) {         // 1 - 2: 0000 0080 - 0000 07FF
76         l = 1;                                                            // 1: 0000 0080 - 0000 07FF Unicode
77     } else if (c >= UICODE_FROM_UTF8[3U] && c <= UICODE_FROM_UTF8[4U]) {  // 3 - 4: 0000 0800 - 0000 FFFF
78         l = 2;                                                            // 2: 0000 0800 - 0000 FFFF Unicode
79     } else if (c >= UICODE_FROM_UTF8[5U] && c <= UICODE_FROM_UTF8[6U]) {  // 5 - 6: 0001 0000 - 0010 FFFF
80         l = 3;                                                            // 3: 0001 0000 - 0010 FFFF Unicode
81     } else if (c >= UICODE_FROM_UTF8[7U] && c <= UICODE_FROM_UTF8[8U]) {  // 7 - 8: 0020 0000 - 03FF FFFF
82         l = 4;                                                            // 4: 0020 0000 - 03FF FFFF Unicode
83         // NOLINTNEXTLINE(readability-magic-numbers)
84     } else if (c == UICODE_FROM_UTF8[9U] || c == UICODE_FROM_UTF8[10U]) {  // 9 - 10: 0400 0000 - 7FFF FFFF
85         l = 5;                                                             // 5: 0400 0000 - 7FFF FFFF Unicode
86     } else {
87         return INVALID_UNICODE_FROM_UTF8;
88     }
89     /* check that we have enough characters */
90     if (l > (maxLen - 1)) {
91         return INVALID_UNICODE_FROM_UTF8;
92     }
93     return FromUtf8(c, l, p, pp);
94 }
95 }  // namespace
96 
97 namespace ark {
98 static constexpr uint32_t CACHE_SIZE = 128;
99 static constexpr uint32_t CHAR_MAXS = 128;
100 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
101 static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
102     /* $ A-Z _ a-z */
103     0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE};
104 static RangeSet g_gRangeD(0x30, 0x39);  // NOLINT(fuchsia-statically-constructed-objects, readability-magic-numbers)
105 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
106 static RangeSet g_gRangeS({
107     std::pair<uint32_t, uint32_t>(0x0009, 0x000D),  // NOLINT(readability-magic-numbers)
108     std::pair<uint32_t, uint32_t>(0x0020, 0x0020),  // NOLINT(readability-magic-numbers)
109     std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0),  // NOLINT(readability-magic-numbers)
110     std::pair<uint32_t, uint32_t>(0x1680, 0x1680),  // NOLINT(readability-magic-numbers)
111     std::pair<uint32_t, uint32_t>(0x2000, 0x200A),  // NOLINT(readability-magic-numbers)
112     /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
113     /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
114     std::pair<uint32_t, uint32_t>(0x2028, 0x2029),  // NOLINT(readability-magic-numbers)
115     std::pair<uint32_t, uint32_t>(0x202F, 0x202F),  // NOLINT(readability-magic-numbers)
116     std::pair<uint32_t, uint32_t>(0x205F, 0x205F),  // NOLINT(readability-magic-numbers)
117     std::pair<uint32_t, uint32_t>(0x3000, 0x3000),  // NOLINT(readability-magic-numbers)
118     /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
119     std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF),  // NOLINT(readability-magic-numbers)
120 });
121 
122 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
123 static RangeSet g_gRangeW({
124     std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINT(readability-magic-numbers)
125     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
126     std::pair<uint32_t, uint32_t>(0x005F, 0x005F),  // NOLINT(readability-magic-numbers)
127     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
128 });
129 
130 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
131 static RangeSet g_gRegexpIdentifyStart({
132     std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINT(readability-magic-numbers)
133     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
134     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
135 });
136 
137 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
138 static RangeSet g_gRegexpIdentifyContinue({
139     std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINT(readability-magic-numbers)
140     std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINT(readability-magic-numbers)
141     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
142     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
143 });
144 
Parse()145 void RegExpParser::Parse()
146 {
147     // dynbuffer head init [size,capture_count,statck_count,flags]
148     buffer_.EmitU32(0);
149     buffer_.EmitU32(0);
150     buffer_.EmitU32(0);
151     buffer_.EmitU32(0);
152     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
153     PrintF("Parse Pattern------\n");
154     // Pattern[U, N]::
155     //      Disjunction[?U, ?N]
156     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
157     Advance();
158     SaveStartOpCode saveStartOp;
159     int captureIndex = captureCount_++;
160     saveStartOp.EmitOpCode(&buffer_, captureIndex);
161     ParseDisjunction(false);
162     if (c0_ != KEY_EOF) {
163         ParseError("extraneous characters at the end");
164         return;
165     }
166     SaveEndOpCode saveEndOp;
167     saveEndOp.EmitOpCode(&buffer_, captureIndex);
168     MatchEndOpCode matchEndOp;
169     matchEndOp.EmitOpCode(&buffer_, 0);
170     // dynbuffer head assignments
171     buffer_.PutU32(0, buffer_.size_);
172     buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
173     buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
174     buffer_.PutU32(FLAGS_OFFSET, flags_);
175 }
176 
ParseDisjunction(bool isBackward)177 void RegExpParser::ParseDisjunction(bool isBackward)
178 {
179     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
180     PrintF("Parse Disjunction------\n");
181     size_t start = buffer_.size_;
182     ParseAlternative(isBackward);
183     if (isError_) {
184         return;
185     }
186     do {
187         if (c0_ == '|') {
188             SplitNextOpCode splitOp;
189             uint32_t len = buffer_.size_ - start;
190             GotoOpCode gotoOp;
191             splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
192             uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
193             Advance();
194             ParseAlternative(isBackward);
195             gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
196         }
197     } while (c0_ != KEY_EOF && c0_ != ')');
198 }
199 
ParseOctalLiteral()200 uint32_t RegExpParser::ParseOctalLiteral()
201 {
202     // For compatibility with some other browsers (not all), we parse
203     // up to three octal digits with a value below 256.
204     // ES#prod-annexB-LegacyOctalEscapeSequence
205     uint32_t value = c0_ - '0';
206     Advance();
207     if (c0_ >= '0' && c0_ <= '7') {
208         value = value * OCTAL_VALUE + c0_ - '0';
209         Advance();
210         if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
211             value = value * OCTAL_VALUE + c0_ - '0';
212             Advance();
213         }
214     }
215     return value;
216 }
217 
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)218 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
219 {
220     uint32_t x = 0;
221     int d = static_cast<int>(HexValue(c0_));
222     if (d < 0) {
223         return false;
224     }
225     while (d >= 0) {
226         if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
227             LOG(FATAL, COMMON) << "value overflow";
228             return false;
229         }
230         x = x * HEX_VALUE + static_cast<uint32_t>(d);
231         if (x > maxValue) {
232             return false;
233         }
234         Advance();
235         d = static_cast<int>(HexValue(c0_));
236     }
237     *value = x;
238     return true;
239 }
240 
241 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uint32_t * value)242 bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
243 {
244     // Accept both \uxxxx and \u{xxxxxx} (if allowed).
245     // In the latter case, the number of hex digits between { } is arbitrary.
246     // \ and u have already been read.
247     if (c0_ == '{' && IsUtf16()) {
248         uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
249         Advance();
250         if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {  // NOLINT(readability-magic-numbers)
251             if (c0_ == '}') {
252                 Advance();
253                 return true;
254             }
255         }
256         pc_ = start;
257         Advance();
258         return false;
259     }
260     // \u but no {, or \u{...} escapes not allowed.
261     bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
262     if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
263         // Attempt to read trail surrogate.
264         uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
265         if (*pc_ == 'u') {
266             Advance(UNICODE_HEX_ADVANCE);
267             uint32_t trail;
268             if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
269                 *value = U16_GET_SUPPLEMENTARY((*value), (trail));  // NOLINT(hicpp-signed-bitwise)
270                 return true;
271             }
272         }
273         pc_ = start;
274         Advance();
275     }
276     return result;
277 }
278 
ParseHexEscape(int length,uint32_t * value)279 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
280 {
281     uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
282     uint32_t val = 0;
283     for (int i = 0; i < length; ++i) {
284         uint32_t c = c0_;
285         int d = static_cast<int>(HexValue(c));
286         if (d < 0) {
287             pc_ = start;
288             Advance();
289             return false;
290         }
291         val = val * HEX_VALUE + static_cast<uint32_t>(d);
292         Advance();
293     }
294     *value = val;
295     return true;
296 }
297 
298 // NOLINTNEXTLINE(readability-function-size)
ParseAlternative(bool isBackward)299 void RegExpParser::ParseAlternative(bool isBackward)
300 {
301     size_t start = buffer_.size_;
302     while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
303         if (isError_) {
304             return;
305         }
306         size_t atomBcStart = buffer_.GetSize();
307         int captureIndex = 0;
308         bool isAtom = false;
309         switch (c0_) {
310             case '^': {
311                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
312                 PrintF("Assertion %c line start \n", c0_);
313                 LineStartOpCode lineStartOp;
314                 lineStartOp.EmitOpCode(&buffer_, 0);
315                 Advance();
316                 break;
317             }
318             case '$': {
319                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
320                 PrintF("Assertion %c line end \n", c0_);
321                 LineEndOpCode lineEndOp;
322                 lineEndOp.EmitOpCode(&buffer_, 0);
323                 Advance();
324                 break;
325             }
326             case '\\': {
327                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
328                 PrintF("Escape %c \n", c0_);
329                 Advance();
330                 switch (c0_) {
331                     case 'b': {
332                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
333                         PrintF("Assertion %c \n", c0_);
334                         WordBoundaryOpCode wordBoundaryOp;
335                         wordBoundaryOp.EmitOpCode(&buffer_, 0);
336                         Advance();
337                         break;
338                     }
339                     case 'B': {
340                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
341                         PrintF("Assertion %c \n", c0_);
342                         NotWordBoundaryOpCode notWordBoundaryOp;
343                         notWordBoundaryOp.EmitOpCode(&buffer_, 0);
344                         Advance();
345                         break;
346                     }
347                     default: {
348                         isAtom = true;
349                         int atomValue = ParseAtomEscape(isBackward);
350                         if (atomValue != -1) {
351                             if (IsIgnoreCase()) {
352                                 if (!IsUtf16()) {
353                                     atomValue = Canonicalize(atomValue, false);
354                                 } else {
355                                     icu::UnicodeSet set(atomValue, atomValue);
356                                     set.closeOver(USET_CASE_INSENSITIVE);
357                                     set.removeAllStrings();
358                                     int32_t size = set.size();
359                                     RangeOpCode rangeOp;
360                                     RangeSet rangeResult;
361                                     for (int32_t idx = 0; idx < size; idx++) {
362                                         int32_t uc = set.charAt(idx);
363                                         RangeSet curRange(uc);
364                                         rangeResult.Insert(curRange);
365                                     }
366                                     rangeOp.InsertOpCode(&buffer_, rangeResult);
367                                     break;
368                                 }
369                             }
370                             if (atomValue <= UINT16_MAX) {
371                                 CharOpCode charOp;
372                                 charOp.EmitOpCode(&buffer_, atomValue);
373                             } else {
374                                 Char32OpCode charOp;
375                                 charOp.EmitOpCode(&buffer_, atomValue);
376                             }
377                         }
378                         break;
379                     }
380                 }
381                 break;
382             }
383             case '(': {
384                 Advance();
385                 isAtom = ParseAssertionCapture(&captureIndex, isBackward);
386                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
387                 Advance();
388                 break;
389             }
390             case '.': {
391                 PrevOpCode prevOp;
392                 if (isBackward) {
393                     prevOp.EmitOpCode(&buffer_, 0);
394                 }
395                 if (IsDotAll()) {
396                     AllOpCode allOp;
397                     allOp.EmitOpCode(&buffer_, 0);
398                 } else {
399                     DotsOpCode dotsOp;
400                     dotsOp.EmitOpCode(&buffer_, 0);
401                 }
402                 if (isBackward) {
403                     prevOp.EmitOpCode(&buffer_, 0);
404                 }
405                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
406                 PrintF("Atom %c match any \n", c0_);
407                 isAtom = true;
408                 Advance();
409                 break;
410             }
411             case '[': {
412                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
413                 PrintF("Atom %c match range \n", c0_);
414                 isAtom = true;
415                 PrevOpCode prevOp;
416                 Advance();
417                 if (isBackward) {
418                     prevOp.EmitOpCode(&buffer_, 0);
419                 }
420                 bool isInvert = false;
421                 if (c0_ == '^') {
422                     isInvert = true;
423                     Advance();
424                 }
425                 RangeSet rangeResult;
426                 if (!ParseClassRanges(&rangeResult)) {
427                     break;
428                 }
429                 if (isInvert) {
430                     rangeResult.Invert(IsUtf16());
431                 }
432                 uint32_t highValue = rangeResult.HighestValue();
433                 if (highValue <= UINT16_MAX) {
434                     RangeOpCode rangeOp;
435                     rangeOp.InsertOpCode(&buffer_, rangeResult);
436                 } else {
437                     Range32OpCode rangeOp;
438                     rangeOp.InsertOpCode(&buffer_, rangeResult);
439                 }
440 
441                 if (isBackward) {
442                     prevOp.EmitOpCode(&buffer_, 0);
443                 }
444                 break;
445             }
446             case '*':
447             case '+':
448             case '?':
449                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
450                 ParseError("nothing to repeat");
451                 return;
452             case '{': {
453                 uint8_t *begin = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
454                 int dummy;
455                 if (ParserIntervalQuantifier(&dummy, &dummy)) {
456                     ParseError("nothing to repeat");
457                     return;
458                 }
459                 pc_ = begin;
460                 Advance();
461             }
462                 [[fallthrough]];
463             case '}':
464             case ']':
465                 if (IsUtf16()) {
466                     ParseError("syntax error");
467                     return;
468                 }
469                 [[fallthrough]];
470             default: {
471                 // PatternCharacter
472                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
473                 PrintF("PatternCharacter %c\n", c0_);
474                 isAtom = true;
475                 {
476                     PrevOpCode prevOp;
477                     if (isBackward) {
478                         prevOp.EmitOpCode(&buffer_, 0);
479                     }
480                     uint32_t matchedChar = c0_;
481                     if (c0_ > (INT8_MAX + 1)) {
482                         Prev();
483                         int i = 0;
484                         UChar32 c;
485                         int32_t length = end_ - pc_ + 1;
486                         // NOLINTNEXTLINE(hicpp-signed-bitwise)
487                         U8_NEXT(pc_, i, length, c);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
488                         matchedChar = static_cast<uint32_t>(c);
489                         pc_ += i;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
490                     }
491                     if (IsIgnoreCase()) {
492                         matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
493                     }
494                     if (matchedChar > UINT16_MAX) {
495                         Char32OpCode charOp;
496                         charOp.EmitOpCode(&buffer_, matchedChar);
497                     } else {
498                         CharOpCode charOp;
499                         charOp.EmitOpCode(&buffer_, matchedChar);
500                     }
501                     if (isBackward) {
502                         prevOp.EmitOpCode(&buffer_, 0);
503                     }
504                 }
505                 Advance();
506                 break;
507             }
508         }
509         if (isAtom && !isError_) {
510             ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
511         }
512         if (isBackward) {
513             size_t end = buffer_.GetSize();
514             size_t termSize = end - atomBcStart;
515             size_t moveSize = end - start;
516             buffer_.Expand(end + termSize);
517             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
518             if (memmove_s(buffer_.buf_ + start + termSize, moveSize, buffer_.buf_ + start, moveSize) != EOK) {
519                 LOG(FATAL, COMMON) << "memmove_s failed";
520                 UNREACHABLE();
521             }
522             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
523             if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
524                 LOG(FATAL, COMMON) << "memcpy_s failed";
525                 UNREACHABLE();
526             }
527         }
528     }
529 }
530 
FindGroupName(const PandaString & name)531 int RegExpParser::FindGroupName(const PandaString &name)
532 {
533     size_t len;
534     size_t nameLen = name.size();
535     const char *p = reinterpret_cast<char *>(groupNames_.buf_);
536     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
537     const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
538     int captureIndex = 1;
539     while (p < bufEnd) {
540         len = strlen(p);
541         if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
542             return captureIndex;
543         }
544         p += len + 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
545         captureIndex++;
546     }
547     return -1;
548 }
549 
ParseAssertionCapture(int * captureIndex,bool isBackward)550 bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
551 {
552     bool isAtom = false;
553     do {
554         if (c0_ == '?') {
555             Advance();
556             switch (c0_) {
557                 // (?=Disjunction[?U, ?N])
558                 case '=': {
559                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
560                     PrintF("Assertion(?= Disjunction)\n");
561                     Advance();
562                     uint32_t start = buffer_.size_;
563                     ParseDisjunction(isBackward);
564                     MatchOpCode matchOp;
565                     matchOp.EmitOpCode(&buffer_, 0);
566                     MatchAheadOpCode matchAheadOp;
567                     uint32_t len = buffer_.size_ - start;
568                     matchAheadOp.InsertOpCode(&buffer_, start, len);
569                     break;
570                 }
571                 // (?!Disjunction[?U, ?N])
572                 case '!': {
573                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
574                     PrintF("Assertion(?! Disjunction)\n");
575                     uint32_t start = buffer_.size_;
576                     Advance();
577                     ParseDisjunction(isBackward);
578                     MatchOpCode matchOp;
579                     matchOp.EmitOpCode(&buffer_, 0);
580                     NegativeMatchAheadOpCode matchAheadOp;
581                     uint32_t len = buffer_.size_ - start;
582                     matchAheadOp.InsertOpCode(&buffer_, start, len);
583                     break;
584                 }
585                 case '<': {
586                     Advance();
587                     // (?<=Disjunction[?U, ?N])
588                     if (c0_ == '=') {
589                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
590                         PrintF("Assertion(?<= Disjunction)\n");
591                         Advance();
592                         uint32_t start = buffer_.size_;
593                         ParseDisjunction(true);
594                         MatchOpCode matchOp;
595                         matchOp.EmitOpCode(&buffer_, 0);
596                         MatchAheadOpCode matchAheadOp;
597                         uint32_t len = buffer_.size_ - start;
598                         matchAheadOp.InsertOpCode(&buffer_, start, len);
599                         // (?<!Disjunction[?U, ?N])
600                     } else if (c0_ == '!') {
601                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
602                         PrintF("Assertion(?<! Disjunction)\n");
603                         Advance();
604                         uint32_t start = buffer_.size_;
605                         ParseDisjunction(true);
606                         MatchOpCode matchOp;
607                         matchOp.EmitOpCode(&buffer_, 0);
608                         NegativeMatchAheadOpCode matchAheadOp;
609                         uint32_t len = buffer_.size_ - start;
610                         matchAheadOp.InsertOpCode(&buffer_, start, len);
611                     } else {
612                         Prev();
613                         PandaString name;
614                         auto **pp = const_cast<const uint8_t **>(&pc_);
615                         if (!ParseGroupSpecifier(pp, name)) {
616                             ParseError("GroupName Syntax error.");
617                             return false;
618                         }
619                         if (FindGroupName(name) > 0) {
620                             ParseError("Duplicate GroupName error.");
621                             return false;
622                         }
623                         groupNames_.EmitStr(name.c_str());
624                         newGroupNames_.push_back(name);
625                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
626                         PrintF("group name %s", name.c_str());
627                         Advance();
628                         goto parseCapture;  // NOLINT(cppcoreguidelines-avoid-goto)
629                     }
630                     break;
631                 }
632                 // (?:Disjunction[?U, ?N])
633                 case ':':
634                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
635                     PrintF("Atom(?<: Disjunction)\n");
636                     isAtom = true;
637                     Advance();
638                     ParseDisjunction(isBackward);
639                     break;
640                 default:
641                     Advance();
642                     ParseError("? Syntax error.");
643                     return false;
644             }
645         } else {
646             groupNames_.EmitChar(0);
647         parseCapture:
648             isAtom = true;
649             *captureIndex = captureCount_++;
650             SaveEndOpCode saveEndOp;
651             SaveStartOpCode saveStartOp;
652             if (isBackward) {
653                 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
654             } else {
655                 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
656             }
657             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
658             PrintF("capture start %d \n", *captureIndex);
659             ParseDisjunction(isBackward);
660             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
661             PrintF("capture end %d \n", *captureIndex);
662             if (isBackward) {
663                 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
664             } else {
665                 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
666             }
667         }
668     } while (c0_ != ')' && c0_ != KEY_EOF);
669     if (c0_ != ')') {
670         ParseError("capture syntax error");
671         return false;
672     }
673     return isAtom;
674 }
675 
ParseDecimalDigits()676 int RegExpParser::ParseDecimalDigits()
677 {
678     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
679     PrintF("Parse DecimalDigits------\n");
680     uint32_t result = 0;
681     bool overflow = false;
682     while (true) {
683         if (c0_ < '0' || c0_ > '9') {
684             break;
685         }
686         if (!overflow) {
687             if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
688                 overflow = true;
689             } else {
690                 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
691             }
692         }
693         Advance();
694     }
695     if (overflow) {
696         return INT32_MAX;
697     }
698     return result;
699 }
700 
ParserIntervalQuantifier(int * pmin,int * pmax)701 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
702 {
703     // Quantifier::
704     //     QuantifierPrefix
705     //     QuantifierPrefix?
706     // QuantifierPrefix::
707     // *
708     // +
709     // ?
710     // {DecimalDigits}
711     // {DecimalDigits,}
712     // {DecimalDigits,DecimalDigits}
713     Advance();
714     *pmin = ParseDecimalDigits();
715     *pmax = *pmin;
716     switch (c0_) {
717         case ',': {
718             Advance();
719             if (c0_ == '}') {
720                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
721                 PrintF("QuantifierPrefix{DecimalDigits,}\n");
722                 *pmax = INT32_MAX;
723                 Advance();
724             } else {
725                 *pmax = ParseDecimalDigits();
726                 if (c0_ == '}') {
727                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
728                     PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
729                     Advance();
730                 } else {
731                     return false;
732                 }
733             }
734             break;
735         }
736         case '}':
737             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
738             PrintF("QuantifierPrefix{DecimalDigits}\n");
739             Advance();
740             break;
741         default:
742             Advance();
743             return false;
744     }
745     return true;
746 }
747 
ParseQuantifierPrefix(int & min,int & max,bool & isGreedy)748 bool RegExpParser::ParseQuantifierPrefix(int &min, int &max, bool &isGreedy)
749 {
750     switch (c0_) {
751         case '*':
752             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
753             PrintF("QuantifierPrefix %c\n", c0_);
754             min = 0;
755             max = INT32_MAX;
756             Advance();
757             break;
758         case '+':
759             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
760             PrintF("QuantifierPrefix %c\n", c0_);
761             min = 1;
762             max = INT32_MAX;
763             Advance();
764             break;
765         case '?':
766             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
767             PrintF("QuantifierPrefix %c\n", c0_);
768             Advance();
769             min = 0;
770             max = 1;
771             break;
772         case '{': {
773             uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
774             if (!ParserIntervalQuantifier(&min, &max)) {
775                 pc_ = start;
776                 Advance();  // back to '{'
777                 return false;
778             }
779             if (min > max) {
780                 ParseError("Invalid repetition count");
781                 return false;
782             }
783             break;
784         }
785         default:
786             break;
787     }
788     if (c0_ == '?') {
789         isGreedy = false;
790         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
791         PrintF("Quantifier::QuantifierPrefix?\n");
792         Advance();
793     } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
794         ParseError("nothing to repeat");
795         return false;
796     }
797     return true;
798 }
799 
ParseQuantifier(size_t atomBcStart,int captureStart,int captureEnd)800 void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
801 {
802     int min = -1;
803     int max = -1;
804     bool isGreedy = true;
805     if (!ParseQuantifierPrefix(min, max, isGreedy)) {
806         return;
807     }
808     if (min != -1 && max != -1) {
809         stackCount_++;
810         PushOpCode pushOp;
811         pushOp.InsertOpCode(&buffer_, atomBcStart);
812         atomBcStart += pushOp.GetSize();
813 
814         if (captureStart != 0) {
815             SaveResetOpCode saveResetOp;
816             saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
817         }
818 
819         // zero advance check
820         if (max == INT32_MAX) {
821             stackCount_++;
822             PushCharOpCode pushCharOp;
823             pushCharOp.InsertOpCode(&buffer_, atomBcStart);
824             CheckCharOpCode checkCharOp;
825             // NOLINTNEXTLINE(readability-magic-numbers)
826             checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
827         }
828 
829         if (isGreedy) {
830             LoopGreedyOpCode loopOp;
831             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
832         } else {
833             LoopOpCode loopOp;
834             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
835         }
836 
837         if (min == 0) {
838             if (isGreedy) {
839                 SplitNextOpCode splitNextOp;
840                 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
841             } else {
842                 SplitFirstOpCode splitFirstOp;
843                 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
844             }
845         }
846 
847         PopOpCode popOp;
848         popOp.EmitOpCode(&buffer_);
849     }
850 }
851 
ParseGroupSpecifier(const uint8_t ** pp,PandaString & name)852 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, PandaString &name)
853 {
854     const uint8_t *p = *pp;
855     uint32_t c;
856     std::array<char, CACHE_SIZE> buffer {};
857     char *q = buffer.data();
858     while (true) {
859         if (p <= end_) {
860             c = *p;
861         } else {
862             c = KEY_EOF;
863         }
864         if (c == '\\') {
865             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
866             p++;
867             if (*p != 'u') {
868                 return false;
869             }
870             if (!ParseUnicodeEscape(&c)) {
871                 return false;
872             }
873         } else if (c == '>') {
874             break;
875         } else if (c > CACHE_SIZE && c != KEY_EOF) {
876             c = static_cast<uint32_t>(UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
877         } else if (c != KEY_EOF) {
878             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
879             p++;
880         } else {
881             return false;
882         }
883         if (q == buffer.data()) {
884             if (IsIdentFirst(c) != 0) {
885                 return false;
886             }
887         } else {
888             if (!u_isIDPart(c)) {
889                 return false;
890             }
891         }
892         if (q != nullptr) {
893             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
894             *q++ = c;
895         }
896     }
897     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
898     p++;
899     *pp = p;
900     name = buffer.data();
901     return true;
902 }
903 
ParseCaptureCount(const char * groupName)904 int RegExpParser::ParseCaptureCount(const char *groupName)
905 {
906     const uint8_t *p = nullptr;
907     int captureIndex = 1;
908     PandaString name;
909     hasNamedCaptures_ = 0;
910     for (p = base_; p < end_; p++) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
911         switch (*p) {
912             case '(': {
913                 if (p[1] == '?') {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
914                     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
915                     if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
916                         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
917                         p[CAPTURE_CONUT_ADVANCE] != '=') {
918                         hasNamedCaptures_ = 1;
919                         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
920                         p += CAPTURE_CONUT_ADVANCE;
921                         if (groupName != nullptr) {
922                             if (ParseGroupSpecifier(&p, name)) {
923                                 if (strcmp(name.c_str(), groupName) == 0) {
924                                     return captureIndex;
925                                 }
926                             }
927                         }
928                         captureIndex++;
929                     }
930                 } else {
931                     captureIndex++;
932                 }
933                 break;
934             }
935             case '\\':
936                 p++;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
937                 break;
938             case '[': {
939                 while (p < end_ && *p != ']') {
940                     if (*p == '\\') {
941                         p++;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
942                     }
943                     p++;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
944                 }
945                 break;
946             }
947             default:
948                 break;
949         }
950     }
951     return captureIndex;
952 }
953 
954 // NOLINTNEXTLINE(readability-function-size)
ParseAtomEscape(bool isBackward)955 int RegExpParser::ParseAtomEscape(bool isBackward)
956 {
957     // AtomEscape[U, N]::
958     //     DecimalEscape
959     //     CharacterClassEscape[?U]
960     //     CharacterEscape[?U]
961     //     [+N]kGroupName[?U]
962     int result = -1;
963     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
964     PrintF("Parse AtomEscape------\n");
965     PrevOpCode prevOp;
966     switch (c0_) {
967         case KEY_EOF:
968             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
969             ParseError("unexpected end");
970             break;
971         // DecimalEscape
972         case '1':
973         case '2':
974         case '3':
975         case '4':
976         case '5':
977         case '6':
978         case '7':
979         case '8':
980         case '9': {
981             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
982             PrintF("NonZeroDigit %c\n", c0_);
983             int capture = ParseDecimalDigits();
984             if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
985                 ParseError("invalid backreference count");
986                 break;
987             }
988             if (isBackward) {
989                 BackwardBackReferenceOpCode backReferenceOp;
990                 backReferenceOp.EmitOpCode(&buffer_, capture);
991             } else {
992                 BackReferenceOpCode backReferenceOp;
993                 backReferenceOp.EmitOpCode(&buffer_, capture);
994             }
995             break;
996         }
997         // CharacterClassEscape
998         case 'd': {
999             // [0-9]
1000             RangeOpCode rangeOp;
1001             if (isBackward) {
1002                 prevOp.EmitOpCode(&buffer_, 0);
1003             }
1004             rangeOp.InsertOpCode(&buffer_, g_gRangeD);
1005             goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
1006         }
1007         case 'D': {
1008             // [^0-9]
1009             RangeSet atomRange(g_gRangeD);
1010             atomRange.Invert(IsUtf16());
1011             Range32OpCode rangeOp;
1012             if (isBackward) {
1013                 prevOp.EmitOpCode(&buffer_, 0);
1014             }
1015             rangeOp.InsertOpCode(&buffer_, atomRange);
1016             goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
1017         }
1018         case 's': {
1019             // [\f\n\r\t\v]
1020             RangeOpCode rangeOp;
1021             if (isBackward) {
1022                 prevOp.EmitOpCode(&buffer_, 0);
1023             }
1024             rangeOp.InsertOpCode(&buffer_, g_gRangeS);
1025             goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
1026         }
1027         case 'S': {
1028             RangeSet atomRange(g_gRangeS);
1029             Range32OpCode rangeOp;
1030             atomRange.Invert(IsUtf16());
1031             if (isBackward) {
1032                 prevOp.EmitOpCode(&buffer_, 0);
1033             }
1034             rangeOp.InsertOpCode(&buffer_, atomRange);
1035             goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
1036         }
1037         case 'w': {
1038             // [A-Za-z0-9]
1039             RangeOpCode rangeOp;
1040             if (isBackward) {
1041                 prevOp.EmitOpCode(&buffer_, 0);
1042             }
1043             rangeOp.InsertOpCode(&buffer_, g_gRangeW);
1044             goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
1045         }
1046         case 'W': {
1047             // [^A-Za-z0-9]
1048             RangeSet atomRange(g_gRangeW);
1049             atomRange.Invert(IsUtf16());
1050             Range32OpCode rangeOp;
1051             if (isBackward) {
1052                 prevOp.EmitOpCode(&buffer_, 0);
1053             }
1054             rangeOp.InsertOpCode(&buffer_, atomRange);
1055             goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
1056         }
1057         // P{UnicodePropertyValueExpression}
1058         // p{UnicodePropertyValueExpression}
1059         case 'P':
1060         case 'p':
1061         // [+N]kGroupName[?U]
1062         case 'k': {
1063             Advance();
1064             if (c0_ != '<') {
1065                 if (!IsUtf16() || HasNamedCaptures()) {
1066                     ParseError("expecting group name.");
1067                     break;
1068                 }
1069             }
1070             Advance();
1071             Prev();
1072             PandaString name;
1073             auto **pp = const_cast<const uint8_t **>(&pc_);
1074             if (!ParseGroupSpecifier(pp, name)) {
1075                 ParseError("GroupName Syntax error.");
1076                 break;
1077             }
1078             int postion = FindGroupName(name);
1079             if (postion < 0) {
1080                 postion = ParseCaptureCount(name.c_str());
1081                 if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
1082                     ParseError("group name not defined");
1083                     break;
1084                 }
1085             }
1086             if (isBackward) {
1087                 BackwardBackReferenceOpCode backReferenceOp;
1088                 backReferenceOp.EmitOpCode(&buffer_, postion);
1089             } else {
1090                 BackReferenceOpCode backReferenceOp;
1091                 backReferenceOp.EmitOpCode(&buffer_, postion);
1092             }
1093             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1094             Advance();
1095             break;
1096         }
1097         parseLookBehind : {
1098             if (isBackward) {
1099                 prevOp.EmitOpCode(&buffer_, 0);
1100             }
1101             Advance();
1102             break;
1103         }
1104         default:
1105             result = ParseCharacterEscape();
1106             break;
1107     }
1108     return result;
1109 }
1110 
RecountCaptures()1111 int RegExpParser::RecountCaptures()
1112 {
1113     if (totalCaptureCount_ < 0) {
1114         const char *name = reinterpret_cast<const char *>(groupNames_.buf_);
1115         totalCaptureCount_ = ParseCaptureCount(name);
1116     }
1117     return totalCaptureCount_;
1118 }
HasNamedCaptures()1119 bool RegExpParser::HasNamedCaptures()
1120 {
1121     if (hasNamedCaptures_ < 0) {
1122         RecountCaptures();
1123     }
1124     return false;
1125 }
1126 
ParseCharacterEscape()1127 int RegExpParser::ParseCharacterEscape()
1128 {
1129     // CharacterEscape[U]::
1130     //     ControlEscape
1131     //     c ControlLetter
1132     //     0 [lookahead ∉ DecimalDigit]
1133     //     HexEscapeSequence
1134     //     RegExpUnicodeEscapeSequence[?U]
1135     //     IdentityEscape[?U]
1136     uint32_t result = 0;
1137     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1138     switch (c0_) {
1139         // ControlEscape
1140         case 'f':
1141             result = '\f';
1142             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1143             PrintF("ControlEscape %c\n", c0_);
1144             Advance();
1145             break;
1146         case 'n':
1147             result = '\n';
1148             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1149             PrintF("ControlEscape %c\n", c0_);
1150             Advance();
1151             break;
1152         case 'r':
1153             result = '\r';
1154             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1155             PrintF("ControlEscape %c\n", c0_);
1156             Advance();
1157             break;
1158         case 't':
1159             result = '\t';
1160             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1161             PrintF("ControlEscape %c\n", c0_);
1162             Advance();
1163             break;
1164         case 'v':
1165             result = '\v';
1166             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1167             PrintF("ControlEscape %c\n", c0_);
1168             Advance();
1169             break;
1170         // c ControlLetter
1171         case 'c': {
1172             Advance();
1173             if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1174                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1175                 PrintF("ControlLetter %c\n", c0_);
1176                 result = static_cast<uint32_t>(c0_) & 0x1f;  // NOLINT(readability-magic-numbers, hicpp-signed-bitwise)
1177                 Advance();
1178             } else {
1179                 if (!IsUtf16()) {
1180                     pc_--;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1181                     result = '\\';
1182                 } else {
1183                     ParseError("Invalid control letter");
1184                     return -1;
1185                 }
1186             }
1187             break;
1188         }
1189         case '0': {
1190             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1191             PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
1192             if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) {  // NOLINT(readability-magic-numbers)
1193                 Advance();
1194                 result = 0;
1195                 break;
1196             }
1197             [[fallthrough]];
1198         }
1199         case '1':
1200         case '2':
1201         case '3':
1202         case '4':
1203         case '5':
1204         case '6':
1205         case '7': {
1206             if (IsUtf16()) {
1207                 // With /u, decimal escape is not interpreted as octal character code.
1208                 ParseError("Invalid class escape");
1209                 return 0;
1210             }
1211             result = ParseOctalLiteral();
1212             break;
1213         }
1214         // ParseHexEscapeSequence
1215         // ParseRegExpUnicodeEscapeSequence
1216         case 'x': {
1217             Advance();
1218             if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
1219                 return result;
1220             }
1221             if (IsUtf16()) {
1222                 ParseError("Invalid class escape");
1223                 return -1;
1224             }
1225             result = 'x';
1226             break;
1227         }
1228         case 'u': {
1229             Advance();
1230             if (ParseUnicodeEscape(&result)) {
1231                 return result;
1232             }
1233             if (IsUtf16()) {
1234                 // With /u, invalid escapes are not treated as identity escapes.
1235                 ParseError("Invalid unicode escape");
1236                 return 0;
1237             }
1238             // If \u is not followed by a two-digit hexadecimal, treat it
1239             // as an identity escape.
1240             result = 'u';
1241             break;
1242         }
1243         // IdentityEscape[?U]
1244         case '$':
1245         case '(':
1246         case ')':
1247         case '*':
1248         case '+':
1249         case '.':
1250         case '/':
1251         case '?':
1252         case '[':
1253         case '\\':
1254         case ']':
1255         case '^':
1256         case '{':
1257         case '|':
1258         case '}':
1259             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1260             PrintF("IdentityEscape %c\n", c0_);
1261             result = c0_;
1262             Advance();
1263             break;
1264         default: {
1265             if (IsUtf16()) {
1266                 ParseError("Invalid unicode escape");
1267                 return 0;
1268             }
1269             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1270             PrintF("SourceCharacter %c\n", c0_);
1271             result = c0_;
1272             if (result < CHAR_MAXS) {
1273                 Advance();
1274             }
1275             break;
1276         }
1277     }
1278     return result;
1279 }
1280 
ParseClassRanges(RangeSet * result)1281 bool RegExpParser::ParseClassRanges(RangeSet *result)
1282 {
1283     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1284     PrintF("Parse ClassRanges------\n");
1285     while (c0_ != ']') {
1286         RangeSet s1;
1287         uint32_t c1 = ParseClassAtom(&s1);
1288         if (c1 == UINT32_MAX) {
1289             ParseError("invalid class range");
1290             return false;
1291         }
1292 
1293         int nextC0 = *pc_;
1294         if (c0_ == '-' && nextC0 != ']') {
1295             if (c1 == CLASS_RANGE_BASE) {
1296                 if (IsUtf16()) {
1297                     ParseError("invalid class range");
1298                     return false;
1299                 }
1300                 result->Insert(s1);
1301                 continue;
1302             }
1303             Advance();
1304             RangeSet s2;
1305             uint32_t c2 = ParseClassAtom(&s2);
1306             if (c2 == UINT32_MAX) {
1307                 ParseError("invalid class range");
1308                 return false;
1309             }
1310             if (c2 == CLASS_RANGE_BASE) {
1311                 if (IsUtf16()) {
1312                     ParseError("invalid class range");
1313                     return false;
1314                 }
1315                 result->Insert(s2);
1316                 continue;
1317             }
1318             if (c1 < INT8_MAX) {
1319                 if (c1 > c2) {
1320                     ParseError("invalid class range");
1321                     return false;
1322                 }
1323             }
1324             if (IsIgnoreCase()) {
1325                 c1 = static_cast<uint32_t>(Canonicalize(c1, IsUtf16()));
1326                 c2 = static_cast<uint32_t>(Canonicalize(c2, IsUtf16()));
1327             }
1328 
1329             result->Insert(c1, c2);
1330         } else {
1331             result->Insert(s1);
1332         }
1333     }
1334     Advance();
1335     return true;
1336 }
1337 
ParseClassAtom(RangeSet * atom)1338 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1339 {
1340     uint32_t ret = UINT32_MAX;
1341     switch (c0_) {
1342         case '\\': {
1343             Advance();
1344             ret = static_cast<uint32_t>(ParseClassEscape(atom));
1345             break;
1346         }
1347         case KEY_EOF:
1348             break;
1349         case 0: {
1350             if (pc_ >= end_) {
1351                 return UINT32_MAX;
1352             }
1353             [[fallthrough]];
1354         }
1355         default: {
1356             uint32_t value = c0_;
1357             size_t u16Size;
1358             if (c0_ > INT8_MAX) {
1359                 pc_ -= 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1360                 auto u16Result = utf::ConvertUtf8ToUtf16Pair(pc_, true);
1361                 value = u16Result.first;
1362                 u16Size = u16Result.second;
1363                 Advance(u16Size + 1);
1364             } else {
1365                 Advance();
1366             }
1367             if (IsIgnoreCase()) {
1368                 value = static_cast<uint32_t>(Canonicalize(value, IsUtf16()));
1369             }
1370             atom->Insert(RangeSet(value));
1371             ret = value;
1372             break;
1373         }
1374     }
1375     return ret;
1376 }
1377 
ParseClassEscape(RangeSet * atom)1378 int RegExpParser::ParseClassEscape(RangeSet *atom)
1379 {
1380     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1381     PrintF("Parse ClassEscape------\n");
1382     int result = -1;
1383     switch (c0_) {
1384         case 'b':
1385             Advance();
1386             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1387             PrintF("ClassEscape %c", 'b');
1388             result = '\b';
1389             atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1390             break;
1391         case '-':
1392             Advance();
1393             result = '-';
1394             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1395             PrintF("ClassEscape %c", '-');
1396             atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1397             break;
1398         // CharacterClassEscape
1399         case 'd':
1400         case 'D':
1401             result = CLASS_RANGE_BASE;
1402             atom->Insert(g_gRangeD);
1403             if (c0_ == 'D') {
1404                 atom->Invert(IsUtf16());
1405             }
1406             Advance();
1407             break;
1408         case 's':
1409         case 'S':
1410             result = CLASS_RANGE_BASE;
1411             atom->Insert(g_gRangeS);
1412             if (c0_ == 'S') {
1413                 atom->Invert(IsUtf16());
1414             }
1415             Advance();
1416             break;
1417         case 'w':
1418         case 'W':
1419             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1420             PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1421             result = CLASS_RANGE_BASE;
1422             atom->Insert(g_gRangeW);
1423             if (c0_ == 'W') {
1424                 atom->Invert(IsUtf16());
1425             }
1426             Advance();
1427             break;
1428         // P{UnicodePropertyValueExpression}
1429         // p{UnicodePropertyValueExpression}
1430         case 'P':
1431         case 'p':
1432             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1433             PrintF("Warning: \\p is not supported in ECMA 2015!");
1434             Advance();
1435             if (c0_ == '{') {
1436                 Advance();
1437                 if (c0_ == '}') {
1438                     break;  // p{}, invalid
1439                 }
1440                 bool isValue = false;
1441                 ParseUnicodePropertyValueCharacters(&isValue);
1442                 if (!isValue && c0_ == '=') {
1443                     // UnicodePropertyName = UnicodePropertyValue
1444                     Advance();
1445                     if (c0_ == '}') {
1446                         break;  // p{xxx=}, invalid
1447                     }
1448                     ParseUnicodePropertyValueCharacters(&isValue);
1449                 }
1450                 if (c0_ != '}') {
1451                     break;  // p{xxx, invalid
1452                 }
1453                 // should do atom->Invert() here after ECMA 9.0
1454                 Advance();
1455                 result = CLASS_RANGE_BASE;
1456             }
1457             break;
1458         default:
1459             result = ParseCharacterEscape();
1460             int value = result;
1461             if (IsIgnoreCase()) {
1462                 value = Canonicalize(value, IsUtf16());
1463             }
1464             atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1465             break;
1466     }
1467     return result;
1468 }
1469 
ParseUnicodePropertyValueCharacters(bool * isValue)1470 void RegExpParser::ParseUnicodePropertyValueCharacters(bool *isValue)
1471 {
1472     if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1473         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1474         PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1475     } else if (c0_ == '_') {
1476         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1477         PrintF("UnicodePropertyCharacter:: _ \n");
1478     } else if (c0_ >= '0' && c0_ <= '9') {
1479         *isValue = true;
1480         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1481         PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1482     } else {
1483         return;
1484     }
1485     Advance();
1486     ParseUnicodePropertyValueCharacters(isValue);
1487 }
1488 
1489 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1490 void RegExpParser::PrintF(const char *fmt, ...)
1491 {
1492     (void)fmt;
1493 }
1494 
ParseError(const char * errorMessage)1495 void RegExpParser::ParseError(const char *errorMessage)
1496 {
1497     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1498     PrintF("error: ");
1499     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1500     PrintF(errorMessage);
1501     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1502     PrintF("\n");
1503     SetIsError();
1504     size_t length = strlen(errorMessage) + 1;
1505     if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1506         LOG(FATAL, COMMON) << "memcpy_s failed";
1507         UNREACHABLE();
1508     }
1509 }
1510 
IsIdentFirst(uint32_t c)1511 int RegExpParser::IsIdentFirst(uint32_t c)
1512 {
1513     if (c < CACHE_SIZE) {
1514         // NOLINTNEXTLINE(hicpp-signed-bitwise
1515         return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1;  // 5: Shift five bits 31: and operation binary of 31
1516     }
1517     return static_cast<int>(u_isIDStart(c));
1518 }
1519 }  // namespace ark