• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2025 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "runtime/regexp/ecmascript/regexp_parser.h"
17 #include "runtime/regexp/ecmascript/regexp_opcode.h"
18 #include "runtime/include/coretypes/string-inl.h"
19 
20 #include "libpandabase/utils/utils.h"
21 #include "libpandabase/utils/utf.h"
22 
23 #include "securec.h"
24 #include "unicode/uchar.h"
25 #include "unicode/uniset.h"
26 
27 namespace {
// Value handed back by the UTF-8 helpers in this file when a sequence cannot be decoded.
constexpr int INVALID_UNICODE_FROM_UTF8 = -1;

// Lead-byte boundaries consulted by UnicodeFromUtf8(). Entry [0] is the first value that is no
// longer returned as a single byte; every following pair is an inclusive lead-byte range for a
// sequence with one more trailing byte than the previous pair.
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
constexpr int UICODE_FROM_UTF8[] = {
    0x80,        // [0]      single-byte limit
    0xC0, 0xDF,  // [1],[2]  one trailing byte
    0xE0, 0xEF,  // [3],[4]  two trailing bytes
    0xF0, 0xF7,  // [5],[6]  three trailing bytes
    0xF8, 0xFB,  // [7],[8]  four trailing bytes
    0xFC, 0xFD,  // [9],[10] five trailing bytes
};

// Smallest value FromUtf8() accepts for a sequence with (index + 1) trailing bytes; anything
// below it is rejected.
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
constexpr int UTF8_MIN_CODE[] = {
    0x80,       // one trailing byte
    0x800,      // two trailing bytes
    0x10000,    // three trailing bytes
    0x200000,   // four trailing bytes
    0x4000000,  // five trailing bytes
};

// Mask FromUtf8() applies to the lead byte of a sequence with (index + 1) trailing bytes.
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
constexpr char UTF8_FIRST_CODE[] = {
    0x1F,  // one trailing byte
    0x0F,  // two trailing bytes
    0x07,  // three trailing bytes
    0x03,  // four trailing bytes
    0x01,  // five trailing bytes
};
44 
FromUtf8(int c,int l,const uint8_t * p,const uint8_t ** pp)45 int FromUtf8(int c, int l, const uint8_t *p, const uint8_t **pp)
46 {
47     uint32_t b;
48     uint32_t cc = c;
49     // NOLINTNEXTLINE(hicpp-signed-bitwise)
50     cc &= UTF8_FIRST_CODE[l - 1];
51     for (int i = 0; i < l; i++) {
52         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
53         b = *p++;
54         if (b < ark::utf::UTF8_2B_SECOND || b >= ark::utf::UTF8_2B_FIRST) {
55             return INVALID_UNICODE_FROM_UTF8;
56         }
57         // NOLINTNEXTLINE(hicpp-signed-bitwise)
58         cc = (cc << 6) | (b & ark::utf::UTF8_2B_THIRD);  // 6: Maximum Unicode range
59     }
60     if (cc < static_cast<uint32_t>(UTF8_MIN_CODE[l - 1])) {
61         return INVALID_UNICODE_FROM_UTF8;
62     }
63     *pp = p;
64     return static_cast<int>(cc);
65 }
66 
UnicodeFromUtf8(const uint8_t * p,int maxLen,const uint8_t ** pp)67 int UnicodeFromUtf8(const uint8_t *p, int maxLen, const uint8_t **pp)
68 {
69     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
70     int c = *p++;
71     if (c < UICODE_FROM_UTF8[0]) {
72         *pp = p;
73         return c;
74     }
75     int l = 0;
76     if (c >= UICODE_FROM_UTF8[1U] && c <= UICODE_FROM_UTF8[2U]) {         // 1 - 2: 0000 0080 - 0000 07FF
77         l = 1;                                                            // 1: 0000 0080 - 0000 07FF Unicode
78     } else if (c >= UICODE_FROM_UTF8[3U] && c <= UICODE_FROM_UTF8[4U]) {  // 3 - 4: 0000 0800 - 0000 FFFF
79         l = 2;                                                            // 2: 0000 0800 - 0000 FFFF Unicode
80     } else if (c >= UICODE_FROM_UTF8[5U] && c <= UICODE_FROM_UTF8[6U]) {  // 5 - 6: 0001 0000 - 0010 FFFF
81         l = 3;                                                            // 3: 0001 0000 - 0010 FFFF Unicode
82     } else if (c >= UICODE_FROM_UTF8[7U] && c <= UICODE_FROM_UTF8[8U]) {  // 7 - 8: 0020 0000 - 03FF FFFF
83         l = 4;                                                            // 4: 0020 0000 - 03FF FFFF Unicode
84         // NOLINTNEXTLINE(readability-magic-numbers)
85     } else if (c == UICODE_FROM_UTF8[9U] || c == UICODE_FROM_UTF8[10U]) {  // 9 - 10: 0400 0000 - 7FFF FFFF
86         l = 5;                                                             // 5: 0400 0000 - 7FFF FFFF Unicode
87     } else {
88         return INVALID_UNICODE_FROM_UTF8;
89     }
90     /* check that we have enough characters */
91     if (l > (maxLen - 1)) {
92         return INVALID_UNICODE_FROM_UTF8;
93     }
94     return FromUtf8(c, l, p, pp);
95 }
96 }  // namespace
97 
98 namespace ark {
99 static constexpr uint32_t CACHE_SIZE = 128;
100 static constexpr uint32_t CHAR_MAXS = 128;
101 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
102 static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
103     /* $ A-Z _ a-z */
104     0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE};
105 static RangeSet g_gRangeD(0x30, 0x39);  // NOLINT(fuchsia-statically-constructed-objects, readability-magic-numbers)
106 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
107 static RangeSet g_gRangeS({
108     std::pair<uint32_t, uint32_t>(0x0009, 0x000D),  // NOLINT(readability-magic-numbers)
109     std::pair<uint32_t, uint32_t>(0x0020, 0x0020),  // NOLINT(readability-magic-numbers)
110     std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0),  // NOLINT(readability-magic-numbers)
111     std::pair<uint32_t, uint32_t>(0x1680, 0x1680),  // NOLINT(readability-magic-numbers)
112     std::pair<uint32_t, uint32_t>(0x2000, 0x200A),  // NOLINT(readability-magic-numbers)
113     /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
114     /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
115     std::pair<uint32_t, uint32_t>(0x2028, 0x2029),  // NOLINT(readability-magic-numbers)
116     std::pair<uint32_t, uint32_t>(0x202F, 0x202F),  // NOLINT(readability-magic-numbers)
117     std::pair<uint32_t, uint32_t>(0x205F, 0x205F),  // NOLINT(readability-magic-numbers)
118     std::pair<uint32_t, uint32_t>(0x3000, 0x3000),  // NOLINT(readability-magic-numbers)
119     /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
120     std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF),  // NOLINT(readability-magic-numbers)
121 });
122 
123 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
124 static RangeSet g_gRangeW({
125     std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINT(readability-magic-numbers)
126     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
127     std::pair<uint32_t, uint32_t>(0x005F, 0x005F),  // NOLINT(readability-magic-numbers)
128     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
129 });
130 
131 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
132 static RangeSet g_gRegexpIdentifyStart({
133     std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINT(readability-magic-numbers)
134     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
135     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
136 });
137 
138 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
139 static RangeSet g_gRegexpIdentifyContinue({
140     std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINT(readability-magic-numbers)
141     std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINT(readability-magic-numbers)
142     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
143     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
144 });
145 
Parse()146 void RegExpParser::Parse()
147 {
148     // dynbuffer head init [size,capture_count,statck_count,flags]
149     buffer_.EmitU32(0);
150     buffer_.EmitU32(0);
151     buffer_.EmitU32(0);
152     buffer_.EmitU32(0);
153     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
154     PrintF("Parse Pattern------\n");
155     // Pattern[U, N]::
156     //      Disjunction[?U, ?N]
157     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
158     Advance();
159     SaveStartOpCode saveStartOp;
160     int captureIndex = captureCount_++;
161     saveStartOp.EmitOpCode(&buffer_, captureIndex);
162     ParseDisjunction(false);
163     if (c0_ != KEY_EOF) {
164         ParseError("extraneous characters at the end");
165         return;
166     }
167     SaveEndOpCode saveEndOp;
168     saveEndOp.EmitOpCode(&buffer_, captureIndex);
169     MatchEndOpCode matchEndOp;
170     matchEndOp.EmitOpCode(&buffer_, 0);
171     // dynbuffer head assignments
172     buffer_.PutU32(0, buffer_.size_);
173     buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
174     buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
175     buffer_.PutU32(FLAGS_OFFSET, flags_);
176 }
177 
ParseDisjunction(bool isBackward)178 void RegExpParser::ParseDisjunction(bool isBackward)
179 {
180     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
181     PrintF("Parse Disjunction------\n");
182     size_t start = buffer_.size_;
183     ParseAlternative(isBackward);
184     if (isError_) {
185         return;
186     }
187     do {
188         if (c0_ == '|') {
189             SplitNextOpCode splitOp;
190             uint32_t len = buffer_.size_ - start;
191             GotoOpCode gotoOp;
192             splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
193             uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
194             Advance();
195             ParseAlternative(isBackward);
196             gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
197         }
198     } while (c0_ != KEY_EOF && c0_ != ')');
199 }
200 
ParseOctalLiteral()201 uint32_t RegExpParser::ParseOctalLiteral()
202 {
203     // For compatibility with some other browsers (not all), we parse
204     // up to three octal digits with a value below 256.
205     // ES#prod-annexB-LegacyOctalEscapeSequence
206     uint32_t value = c0_ - '0';
207     Advance();
208     if (c0_ >= '0' && c0_ <= '7') {
209         value = value * OCTAL_VALUE + c0_ - '0';
210         Advance();
211         if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
212             value = value * OCTAL_VALUE + c0_ - '0';
213             Advance();
214         }
215     }
216     return value;
217 }
218 
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)219 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
220 {
221     uint32_t x = 0;
222     int d = static_cast<int>(HexValue(c0_));
223     if (d < 0) {
224         return false;
225     }
226     while (d >= 0) {
227         if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
228             LOG(FATAL, COMMON) << "value overflow";
229             return false;
230         }
231         x = x * HEX_VALUE + static_cast<uint32_t>(d);
232         if (x > maxValue) {
233             return false;
234         }
235         Advance();
236         d = static_cast<int>(HexValue(c0_));
237     }
238     *value = x;
239     return true;
240 }
241 
242 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uint32_t * value)243 bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
244 {
245     // Accept both \uxxxx and \u{xxxxxx} (if allowed).
246     // In the latter case, the number of hex digits between { } is arbitrary.
247     // \ and u have already been read.
248     if (c0_ == '{' && IsUtf16()) {
249         uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
250         Advance();
251         if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {  // NOLINT(readability-magic-numbers)
252             if (c0_ == '}') {
253                 Advance();
254                 return true;
255             }
256         }
257         pc_ = start;
258         Advance();
259         return false;
260     }
261     // \u but no {, or \u{...} escapes not allowed.
262     bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
263     if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
264         // Attempt to read trail surrogate.
265         uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
266         if (*pc_ == 'u') {
267             Advance(UNICODE_HEX_ADVANCE);
268             uint32_t trail;
269             if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
270                 *value = U16_GET_SUPPLEMENTARY((*value), (trail));  // NOLINT(hicpp-signed-bitwise)
271                 return true;
272             }
273         }
274         pc_ = start;
275         Advance();
276     }
277     return result;
278 }
279 
ParseHexEscape(int length,uint32_t * value)280 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
281 {
282     uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
283     uint32_t val = 0;
284     for (int i = 0; i < length; ++i) {
285         uint32_t c = c0_;
286         int d = static_cast<int>(HexValue(c));
287         if (d < 0) {
288             pc_ = start;
289             Advance();
290             return false;
291         }
292         val = val * HEX_VALUE + static_cast<uint32_t>(d);
293         Advance();
294     }
295     *value = val;
296     return true;
297 }
298 
ParseAlternativeEscape(bool isBackward,bool & isAtom)299 void RegExpParser::ParseAlternativeEscape(bool isBackward, bool &isAtom)
300 {
301     switch (c0_) {
302         case 'b': {
303             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
304             PrintF("Assertion %c \n", c0_);
305             WordBoundaryOpCode wordBoundaryOp;
306             wordBoundaryOp.EmitOpCode(&buffer_, 0);
307             Advance();
308             break;
309         }
310         case 'B': {
311             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
312             PrintF("Assertion %c \n", c0_);
313             NotWordBoundaryOpCode notWordBoundaryOp;
314             notWordBoundaryOp.EmitOpCode(&buffer_, 0);
315             Advance();
316             break;
317         }
318         default: {
319             isAtom = true;
320             int atomValue = ParseAtomEscape(isBackward);
321             if (atomValue != -1) {
322                 ParseAlternativeEscapeDefault(atomValue);
323             }
324             break;
325         }
326     }
327 }
328 
ParseAlternativeEscapeDefault(int atomValue)329 void RegExpParser::ParseAlternativeEscapeDefault(int atomValue)
330 {
331     if (IsIgnoreCase()) {
332         if (!IsUtf16()) {
333             atomValue = Canonicalize(atomValue, false);
334         } else {
335             icu::UnicodeSet set(atomValue, atomValue);
336             set.closeOver(USET_CASE_INSENSITIVE);
337             set.removeAllStrings();
338             int32_t size = set.size();
339             RangeOpCode rangeOp;
340             RangeSet rangeResult;
341             for (int32_t idx = 0; idx < size; idx++) {
342                 int32_t uc = set.charAt(idx);
343                 RangeSet curRange(uc);
344                 rangeResult.Insert(curRange);
345             }
346             rangeOp.InsertOpCode(&buffer_, rangeResult);
347             return;
348         }
349     }
350     if (atomValue <= UINT16_MAX) {
351         CharOpCode charOp;
352         charOp.EmitOpCode(&buffer_, atomValue);
353     } else {
354         Char32OpCode charOp;
355         charOp.EmitOpCode(&buffer_, atomValue);
356     }
357 }
358 
ParsePatternCharacter(bool isBackward)359 void RegExpParser::ParsePatternCharacter(bool isBackward)
360 {
361     PrevOpCode prevOp;
362     if (isBackward) {
363         prevOp.EmitOpCode(&buffer_, 0);
364     }
365     uint32_t matchedChar = c0_;
366     if (c0_ > (INT8_MAX + 1)) {
367         Prev();
368         int i = 0;
369         UChar32 c = 0;
370         int32_t length = end_ - pc_ + 1;
371         // NOLINTNEXTLINE(hicpp-signed-bitwise)
372         U8_NEXT(pc_, i, length, c);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
373         matchedChar = static_cast<uint32_t>(c);
374         pc_ += i;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
375     }
376     if (IsIgnoreCase()) {
377         matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
378     }
379     if (matchedChar > UINT16_MAX) {
380         Char32OpCode charOp;
381         charOp.EmitOpCode(&buffer_, matchedChar);
382     } else {
383         CharOpCode charOp;
384         charOp.EmitOpCode(&buffer_, matchedChar);
385     }
386     if (isBackward) {
387         prevOp.EmitOpCode(&buffer_, 0);
388     }
389 }
390 
ParseAlternativeAny(bool isBackward)391 void RegExpParser::ParseAlternativeAny(bool isBackward)
392 {
393     PrevOpCode prevOp;
394     if (isBackward) {
395         prevOp.EmitOpCode(&buffer_, 0);
396     }
397     if (IsDotAll()) {
398         AllOpCode allOp;
399         allOp.EmitOpCode(&buffer_, 0);
400     } else {
401         DotsOpCode dotsOp;
402         dotsOp.EmitOpCode(&buffer_, 0);
403     }
404     if (isBackward) {
405         prevOp.EmitOpCode(&buffer_, 0);
406     }
407 }
408 
ParseAlternativeRange(bool isBackward)409 void RegExpParser::ParseAlternativeRange(bool isBackward)
410 {
411     PrevOpCode prevOp;
412     Advance();
413     if (isBackward) {
414         prevOp.EmitOpCode(&buffer_, 0);
415     }
416     bool isInvert = false;
417     if (c0_ == '^') {
418         isInvert = true;
419         Advance();
420     }
421     RangeSet rangeResult;
422     if (!ParseClassRanges(&rangeResult)) {
423         return;
424     }
425     if (isInvert) {
426         rangeResult.Invert(IsUtf16());
427     }
428     uint32_t highValue = rangeResult.HighestValue();
429     if (highValue <= UINT16_MAX) {
430         RangeOpCode rangeOp;
431         rangeOp.InsertOpCode(&buffer_, rangeResult);
432     } else {
433         Range32OpCode rangeOp;
434         rangeOp.InsertOpCode(&buffer_, rangeResult);
435     }
436 
437     if (isBackward) {
438         prevOp.EmitOpCode(&buffer_, 0);
439     }
440 }
441 
442 // CC-OFFNXT(G.FUN.01, huge_method) solid logic
ParseAlternativeImpl(bool isBackward,bool & isAtom,int & captureIndex)443 void RegExpParser::ParseAlternativeImpl(bool isBackward, bool &isAtom, int &captureIndex)
444 {
445     switch (c0_) {
446         case '^': {
447             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
448             PrintF("Assertion %c line start \n", c0_);
449             LineStartOpCode lineStartOp;
450             lineStartOp.EmitOpCode(&buffer_, 0);
451             Advance();
452             break;
453         }
454         case '$': {
455             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
456             PrintF("Assertion %c line end \n", c0_);
457             LineEndOpCode lineEndOp;
458             lineEndOp.EmitOpCode(&buffer_, 0);
459             Advance();
460             break;
461         }
462         case '\\': {
463             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
464             PrintF("Escape %c \n", c0_);
465             Advance();
466             ParseAlternativeEscape(isBackward, isAtom);
467             break;
468         }
469         case '(': {
470             Advance();
471             isAtom = ParseAssertionCapture(&captureIndex, isBackward);
472             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
473             Advance();
474             break;
475         }
476         case '.': {
477             ParseAlternativeAny(isBackward);
478             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
479             PrintF("Atom %c match any \n", c0_);
480             isAtom = true;
481             Advance();
482             break;
483         }
484         case '[': {
485             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
486             PrintF("Atom %c match range \n", c0_);
487             isAtom = true;
488             ParseAlternativeRange(isBackward);
489             break;
490         }
491         case '*':
492         case '+':
493         case '?':
494             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
495             ParseError("nothing to repeat");
496             return;
497         case '{': {
498             uint8_t *begin = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
499             int dummy;
500             if (ParserIntervalQuantifier(&dummy, &dummy)) {
501                 ParseError("nothing to repeat");
502                 return;
503             }
504             pc_ = begin;
505             Advance();
506         }
507             [[fallthrough]];
508         case '}':
509         case ']':
510             if (IsUtf16()) {
511                 ParseError("syntax error");
512                 return;
513             }
514             [[fallthrough]];
515         default: {
516             // PatternCharacter
517             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
518             PrintF("PatternCharacter %c\n", c0_);
519             isAtom = true;
520             ParsePatternCharacter(isBackward);
521             Advance();
522             break;
523         }
524     }
525 }
526 
ParseAlternative(bool isBackward)527 void RegExpParser::ParseAlternative(bool isBackward)
528 {
529     size_t start = buffer_.size_;
530     while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
531         if (isError_) {
532             return;
533         }
534         size_t atomBcStart = buffer_.GetSize();
535         int captureIndex = 0;
536         bool isAtom = false;
537         ParseAlternativeImpl(isBackward, isAtom, captureIndex);
538         if (isAtom && !isError_) {
539             ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
540         }
541         if (isBackward) {
542             size_t end = buffer_.GetSize();
543             size_t termSize = end - atomBcStart;
544             size_t moveSize = end - start;
545             buffer_.Expand(end + termSize);
546             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
547             if (memmove_s(buffer_.buf_ + start + termSize, moveSize, buffer_.buf_ + start, moveSize) != EOK) {
548                 LOG(FATAL, COMMON) << "memmove_s failed";
549                 UNREACHABLE();
550             }
551             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
552             if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
553                 LOG(FATAL, COMMON) << "memcpy_s failed";
554                 UNREACHABLE();
555             }
556         }
557     }
558 }
559 
FindGroupName(const PandaString & name)560 int RegExpParser::FindGroupName(const PandaString &name)
561 {
562     size_t len;
563     size_t nameLen = name.size();
564     const char *p = reinterpret_cast<char *>(groupNames_.buf_);
565     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
566     const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
567     int captureIndex = 1;
568     while (p < bufEnd) {
569         len = strlen(p);
570         if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
571             return captureIndex;
572         }
573         p += len + 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
574         captureIndex++;
575     }
576     return -1;
577 }
578 
579 template <typename OpCodeT>
InsertMatchAheadOpCode(bool isBackward)580 void RegExpParser::InsertMatchAheadOpCode(bool isBackward)
581 {
582     Advance();
583     uint32_t start = buffer_.size_;
584     ParseDisjunction(isBackward);
585     MatchOpCode matchOp;
586     matchOp.EmitOpCode(&buffer_, 0);
587     OpCodeT matchAheadOp;
588     uint32_t len = buffer_.size_ - start;
589     matchAheadOp.InsertOpCode(&buffer_, start, len);
590 }
591 
HandleGroupName()592 bool RegExpParser::HandleGroupName()
593 {
594     PandaString name;
595     auto **pp = const_cast<const uint8_t **>(&pc_);
596     if (!ParseGroupSpecifier(pp, name)) {
597         ParseError("GroupName Syntax error.");
598         return false;
599     }
600     if (FindGroupName(name) > 0) {
601         ParseError("Duplicate GroupName error.");
602         return false;
603     }
604     groupNames_.EmitStr(name.c_str());
605     newGroupNames_.push_back(name);
606     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
607     PrintF("group name %s", name.c_str());
608 
609     return true;
610 }
611 
ParseAssertion(bool isBackward,bool & isAtom,bool & parseCapture)612 bool RegExpParser::ParseAssertion(bool isBackward, bool &isAtom, bool &parseCapture)
613 {
614     switch (c0_) {
615         // (?=Disjunction[?U, ?N])
616         case '=': {
617             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
618             PrintF("Assertion(?= Disjunction)\n");
619             InsertMatchAheadOpCode<MatchAheadOpCode>(isBackward);
620             break;
621         }
622         // (?!Disjunction[?U, ?N])
623         case '!': {
624             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
625             PrintF("Assertion(?! Disjunction)\n");
626             InsertMatchAheadOpCode<NegativeMatchAheadOpCode>(isBackward);
627             break;
628         }
629         case '<': {
630             Advance();
631             // (?<=Disjunction[?U, ?N])
632             if (c0_ == '=') {
633                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
634                 PrintF("Assertion(?<= Disjunction)\n");
635                 InsertMatchAheadOpCode<MatchAheadOpCode>(true);
636                 return true;
637                 // (?<!Disjunction[?U, ?N])
638             }
639             if (c0_ == '!') {
640                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
641                 PrintF("Assertion(?<! Disjunction)\n");
642                 InsertMatchAheadOpCode<NegativeMatchAheadOpCode>(true);
643                 return true;
644             }
645 
646             Prev();
647             if (!HandleGroupName()) {
648                 return false;
649             }
650             Advance();
651             parseCapture = true;
652             break;
653         }
654         // (?:Disjunction[?U, ?N])
655         case ':':
656             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
657             PrintF("Atom(?<: Disjunction)\n");
658             isAtom = true;
659             Advance();
660             ParseDisjunction(isBackward);
661             break;
662         default:
663             Advance();
664             ParseError("? Syntax error.");
665             return false;
666     }
667 
668     return true;
669 }
670 
ParseAssertionCapture(int * captureIndex,bool isBackward)671 bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
672 {
673     bool isAtom = false;
674     do {
675         bool parseCapture = false;
676         if (c0_ == '?') {
677             Advance();
678             if (!ParseAssertion(isBackward, isAtom, parseCapture)) {
679                 return false;
680             }
681         } else {
682             groupNames_.EmitChar(0);
683             parseCapture = true;
684         }
685         if (parseCapture) {
686             isAtom = true;
687             *captureIndex = captureCount_++;
688             SaveEndOpCode saveEndOp;
689             SaveStartOpCode saveStartOp;
690             if (isBackward) {
691                 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
692             } else {
693                 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
694             }
695             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
696             PrintF("capture start %d \n", *captureIndex);
697             ParseDisjunction(isBackward);
698             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
699             PrintF("capture end %d \n", *captureIndex);
700             if (isBackward) {
701                 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
702             } else {
703                 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
704             }
705         }
706     } while (c0_ != ')' && c0_ != KEY_EOF);
707     if (c0_ != ')') {
708         ParseError("capture syntax error");
709         return false;
710     }
711     return isAtom;
712 }
713 
ParseDecimalDigits()714 int RegExpParser::ParseDecimalDigits()
715 {
716     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
717     PrintF("Parse DecimalDigits------\n");
718     uint32_t result = 0;
719     bool overflow = false;
720     while (true) {
721         if (c0_ < '0' || c0_ > '9') {
722             break;
723         }
724         if (!overflow) {
725             if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
726                 overflow = true;
727             } else {
728                 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
729             }
730         }
731         Advance();
732     }
733     if (overflow) {
734         return INT32_MAX;
735     }
736     return result;
737 }
738 
ParserIntervalQuantifier(int * pmin,int * pmax)739 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
740 {
741     // Quantifier::
742     //     QuantifierPrefix
743     //     QuantifierPrefix?
744     // QuantifierPrefix::
745     // *
746     // +
747     // ?
748     // {DecimalDigits}
749     // {DecimalDigits,}
750     // {DecimalDigits,DecimalDigits}
751     Advance();
752     *pmin = ParseDecimalDigits();
753     *pmax = *pmin;
754     switch (c0_) {
755         case ',': {
756             Advance();
757             if (c0_ == '}') {
758                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
759                 PrintF("QuantifierPrefix{DecimalDigits,}\n");
760                 *pmax = INT32_MAX;
761                 Advance();
762             } else {
763                 *pmax = ParseDecimalDigits();
764                 if (c0_ == '}') {
765                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
766                     PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
767                     Advance();
768                 } else {
769                     return false;
770                 }
771             }
772             break;
773         }
774         case '}':
775             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
776             PrintF("QuantifierPrefix{DecimalDigits}\n");
777             Advance();
778             break;
779         default:
780             Advance();
781             return false;
782     }
783     return true;
784 }
785 
ParseQuantifierPrefix(int & min,int & max,bool & isGreedy)786 bool RegExpParser::ParseQuantifierPrefix(int &min, int &max, bool &isGreedy)
787 {
788     switch (c0_) {
789         case '*':
790             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
791             PrintF("QuantifierPrefix %c\n", c0_);
792             min = 0;
793             max = INT32_MAX;
794             Advance();
795             break;
796         case '+':
797             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
798             PrintF("QuantifierPrefix %c\n", c0_);
799             min = 1;
800             max = INT32_MAX;
801             Advance();
802             break;
803         case '?':
804             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
805             PrintF("QuantifierPrefix %c\n", c0_);
806             Advance();
807             min = 0;
808             max = 1;
809             break;
810         case '{': {
811             uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
812             if (!ParserIntervalQuantifier(&min, &max)) {
813                 pc_ = start;
814                 Advance();  // back to '{'
815                 return false;
816             }
817             if (min > max) {
818                 ParseError("Invalid repetition count");
819                 return false;
820             }
821             break;
822         }
823         default:
824             break;
825     }
826     if (c0_ == '?') {
827         isGreedy = false;
828         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
829         PrintF("Quantifier::QuantifierPrefix?\n");
830         Advance();
831     } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
832         ParseError("nothing to repeat");
833         return false;
834     }
835     return true;
836 }
837 
ParseQuantifier(size_t atomBcStart,int captureStart,int captureEnd)838 void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
839 {
840     int min = -1;
841     int max = -1;
842     bool isGreedy = true;
843     if (!ParseQuantifierPrefix(min, max, isGreedy)) {
844         return;
845     }
846     if (min != -1 && max != -1) {
847         stackCount_++;
848         PushOpCode pushOp;
849         pushOp.InsertOpCode(&buffer_, atomBcStart);
850         atomBcStart += pushOp.GetSize();
851 
852         if (captureStart != 0) {
853             SaveResetOpCode saveResetOp;
854             saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
855         }
856 
857         // zero advance check
858         if (max == INT32_MAX) {
859             stackCount_++;
860             PushCharOpCode pushCharOp;
861             pushCharOp.InsertOpCode(&buffer_, atomBcStart);
862             CheckCharOpCode checkCharOp;
863             // NOLINTNEXTLINE(readability-magic-numbers)
864             checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
865         }
866 
867         if (isGreedy) {
868             LoopGreedyOpCode loopOp;
869             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
870         } else {
871             LoopOpCode loopOp;
872             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
873         }
874 
875         if (min == 0) {
876             if (isGreedy) {
877                 SplitNextOpCode splitNextOp;
878                 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
879             } else {
880                 SplitFirstOpCode splitFirstOp;
881                 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
882             }
883         }
884 
885         PopOpCode popOp;
886         popOp.EmitOpCode(&buffer_);
887     }
888 }
889 
ParseGroupSpecifier(const uint8_t ** pp,PandaString & name)890 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, PandaString &name)
891 {
892     const uint8_t *p = *pp;
893     uint32_t c;
894     std::array<char, CACHE_SIZE> buffer {};
895     char *q = buffer.data();
896     while (true) {
897         if (p <= end_) {
898             c = *p;
899         } else {
900             c = KEY_EOF;
901         }
902         if (c == '\\') {
903             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
904             p++;
905             if (*p != 'u') {
906                 return false;
907             }
908             if (!ParseUnicodeEscape(&c)) {
909                 return false;
910             }
911         } else if (c == '>') {
912             break;
913         } else if (c > CACHE_SIZE && c != KEY_EOF) {
914             c = static_cast<uint32_t>(UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
915         } else if (c != KEY_EOF) {
916             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
917             p++;
918         } else {
919             return false;
920         }
921         if (q == buffer.data()) {
922             if (IsIdentFirst(c) == 0) {
923                 return false;
924             }
925         } else {
926             if (!u_isIDPart(c)) {
927                 return false;
928             }
929         }
930         if (q != nullptr) {
931             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
932             *q++ = c;
933         }
934     }
935     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
936     p++;
937     *pp = p;
938     name = buffer.data();
939     return true;
940 }
941 
CalculateCaptureIndex(const uint8_t * p,int & captureIndex,const char * groupName,PandaString & name)942 bool RegExpParser::CalculateCaptureIndex(const uint8_t *p, int &captureIndex, const char *groupName, PandaString &name)
943 {
944     if (p[1] == '?') {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
945         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
946         if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
947             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
948             p[CAPTURE_CONUT_ADVANCE] != '=') {
949             hasNamedCaptures_ = 1;
950             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
951             p += CAPTURE_CONUT_ADVANCE;
952             if (groupName != nullptr && ParseGroupSpecifier(&p, name) && strcmp(name.c_str(), groupName) == 0) {
953                 return true;
954             }
955             captureIndex++;
956         }
957     } else {
958         captureIndex++;
959     }
960 
961     return false;
962 }
963 
// Advances `p` until it points at the next unescaped ']' or reaches `end`.
//
// `p` is taken by reference so that the caller actually observes the new position (when it was
// taken by value the function had no visible effect). A backslash skips the character that
// follows it; the skip is clamped so that `p` never moves beyond `end`.
static inline void ShiftPointerToClosingBracket(const uint8_t *&p, const uint8_t *end)
{
    while (p < end && *p != ']') {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        if (*p == '\\' && p + 1 < end) {
            p++;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        }
        p++;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
    }
}
973 
ParseCaptureCountImpl(const uint8_t * p,int & captureIndex,const char * groupName,PandaString & name)974 bool RegExpParser::ParseCaptureCountImpl(const uint8_t *p, int &captureIndex, const char *groupName, PandaString &name)
975 {
976     switch (*p) {
977         case '(': {
978             if (CalculateCaptureIndex(p, captureIndex, groupName, name)) {
979                 return true;
980             }
981             break;
982         }
983         case '\\':
984             p++;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
985             break;
986         case '[': {
987             ShiftPointerToClosingBracket(p, end_);
988             break;
989         }
990         default:
991             break;
992     }
993 
994     return false;
995 }
996 
ParseCaptureCount(const char * groupName)997 int RegExpParser::ParseCaptureCount(const char *groupName)
998 {
999     const uint8_t *p = nullptr;
1000     int captureIndex = 1;
1001     PandaString name;
1002     hasNamedCaptures_ = 0;
1003     for (p = base_; p < end_; p++) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1004         if (ParseCaptureCountImpl(p, captureIndex, groupName, name)) {
1005             return captureIndex;
1006         }
1007     }
1008     return captureIndex;
1009 }
1010 
ParseLookBehind(DynChunk & buffer,PrevOpCode & prevOp,bool isBackward)1011 void RegExpParser::ParseLookBehind(DynChunk &buffer, PrevOpCode &prevOp, bool isBackward)
1012 {
1013     if (isBackward) {
1014         prevOp.EmitOpCode(&buffer, 0);
1015     }
1016     Advance();
1017 }
1018 
InsertRangeOpCode(DynChunk & buffer,RangeSet & rangeSet,PrevOpCode & prevOp,bool isBackward)1019 void RegExpParser::InsertRangeOpCode(DynChunk &buffer, RangeSet &rangeSet, PrevOpCode &prevOp, bool isBackward)
1020 {
1021     RangeOpCode rangeOp;
1022     if (isBackward) {
1023         prevOp.EmitOpCode(&buffer, 0);
1024     }
1025     rangeOp.InsertOpCode(&buffer, rangeSet);
1026     ParseLookBehind(buffer, prevOp, isBackward);
1027 }
1028 
InsertRange32OpCode(DynChunk & buffer,RangeSet & rangeSet,PrevOpCode & prevOp,bool isBackward)1029 void RegExpParser::InsertRange32OpCode(DynChunk &buffer, RangeSet &rangeSet, PrevOpCode &prevOp, bool isBackward)
1030 {
1031     RangeSet atomRange(rangeSet);
1032     atomRange.Invert(IsUtf16());
1033     Range32OpCode rangeOp;
1034     if (isBackward) {
1035         prevOp.EmitOpCode(&buffer, 0);
1036     }
1037     rangeOp.InsertOpCode(&buffer, atomRange);
1038     ParseLookBehind(buffer, prevOp, isBackward);
1039 }
1040 
ParseGroupName()1041 int RegExpParser::ParseGroupName()
1042 {
1043     Advance();
1044     if (c0_ != '<') {
1045         if (!IsUtf16() || HasNamedCaptures()) {
1046             ParseError("expecting group name.");
1047             return -1;
1048         }
1049     }
1050     Advance();
1051     Prev();
1052     PandaString name;
1053     auto **pp = const_cast<const uint8_t **>(&pc_);
1054     if (!ParseGroupSpecifier(pp, name)) {
1055         ParseError("GroupName Syntax error.");
1056         return -1;
1057     }
1058     int postion = FindGroupName(name);
1059     if (postion < 0) {
1060         postion = ParseCaptureCount(name.c_str());
1061         if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
1062             ParseError("group name not defined");
1063             return -1;
1064         }
1065     }
1066 
1067     return postion;
1068 }
1069 
EmitRefOpCode(DynChunk & buffer,uint32_t para,bool isBackward)1070 static void EmitRefOpCode(DynChunk &buffer, uint32_t para, bool isBackward)
1071 {
1072     if (isBackward) {
1073         BackwardBackReferenceOpCode backReferenceOp;
1074         backReferenceOp.EmitOpCode(&buffer, para);
1075     } else {
1076         BackReferenceOpCode backReferenceOp;
1077         backReferenceOp.EmitOpCode(&buffer, para);
1078     }
1079 }
1080 
1081 // CC-OFFNXT(G.FUN.01, huge_method) big switch case
1082 // NOLINTNEXTLINE(readability-function-size)
ParseAtomEscape(bool isBackward)1083 int RegExpParser::ParseAtomEscape(bool isBackward)
1084 {
1085     // AtomEscape[U, N]::
1086     //     DecimalEscape
1087     //     CharacterClassEscape[?U]
1088     //     CharacterEscape[?U]
1089     //     [+N]kGroupName[?U]
1090     int result = -1;
1091     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1092     PrintF("Parse AtomEscape------\n");
1093     PrevOpCode prevOp;
1094     switch (c0_) {
1095         case KEY_EOF:
1096             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1097             ParseError("unexpected end");
1098             break;
1099         // DecimalEscape
1100         case '1':
1101         case '2':
1102         case '3':
1103         case '4':
1104         case '5':
1105         case '6':
1106         case '7':
1107         case '8':
1108         case '9': {
1109             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1110             PrintF("NonZeroDigit %c\n", c0_);
1111             int capture = ParseDecimalDigits();
1112             if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
1113                 ParseError("invalid backreference count");
1114                 break;
1115             }
1116             EmitRefOpCode(buffer_, capture, isBackward);
1117             break;
1118         }
1119         // CharacterClassEscape
1120         case 'd': {
1121             // [0-9]
1122             InsertRangeOpCode(buffer_, g_gRangeD, prevOp, isBackward);
1123             break;
1124         }
1125         case 'D': {
1126             // [^0-9]
1127             InsertRange32OpCode(buffer_, g_gRangeD, prevOp, isBackward);
1128             break;
1129         }
1130         case 's': {
1131             // [\f\n\r\t\v]
1132             InsertRangeOpCode(buffer_, g_gRangeS, prevOp, isBackward);
1133             break;
1134         }
1135         case 'S': {
1136             InsertRange32OpCode(buffer_, g_gRangeS, prevOp, isBackward);
1137             break;
1138         }
1139         case 'w': {
1140             // [A-Za-z0-9]
1141             InsertRangeOpCode(buffer_, g_gRangeW, prevOp, isBackward);
1142             break;
1143         }
1144         case 'W': {
1145             // [^A-Za-z0-9]
1146             InsertRange32OpCode(buffer_, g_gRangeW, prevOp, isBackward);
1147             break;
1148         }
1149         // P{UnicodePropertyValueExpression}
1150         // p{UnicodePropertyValueExpression}
1151         case 'P':
1152         case 'p':
1153         // [+N]kGroupName[?U]
1154         case 'k': {
1155             int postion = ParseGroupName();
1156             if (postion < 0) {
1157                 break;
1158             }
1159             EmitRefOpCode(buffer_, postion, isBackward);
1160             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1161             Advance();
1162             break;
1163         }
1164         default:
1165             result = ParseCharacterEscape();
1166             break;
1167     }
1168     return result;
1169 }
1170 
RecountCaptures()1171 int RegExpParser::RecountCaptures()
1172 {
1173     if (totalCaptureCount_ < 0) {
1174         const char *name = reinterpret_cast<const char *>(groupNames_.buf_);
1175         totalCaptureCount_ = ParseCaptureCount(name);
1176     }
1177     return totalCaptureCount_;
1178 }
HasNamedCaptures()1179 bool RegExpParser::HasNamedCaptures()
1180 {
1181     if (hasNamedCaptures_ < 0) {
1182         RecountCaptures();
1183     }
1184     return false;
1185 }
1186 
1187 // CC-OFFNXT(G.FUN.01, huge_cyclomatic_complexity, huge_method) big switch case
ParseCharacterEscape()1188 int RegExpParser::ParseCharacterEscape()
1189 {
1190     // CharacterEscape[U]::
1191     //     ControlEscape
1192     //     c ControlLetter
1193     //     0 [lookahead ∉ DecimalDigit]
1194     //     HexEscapeSequence
1195     //     RegExpUnicodeEscapeSequence[?U]
1196     //     IdentityEscape[?U]
1197     uint32_t result = 0;
1198     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1199     switch (c0_) {
1200         // ControlEscape
1201         case 'f':
1202             result = '\f';
1203             PrintControlEscapeAndAdvance();
1204             break;
1205         case 'n':
1206             result = '\n';
1207             PrintControlEscapeAndAdvance();
1208             break;
1209         case 'r':
1210             result = '\r';
1211             PrintControlEscapeAndAdvance();
1212             break;
1213         case 't':
1214             result = '\t';
1215             PrintControlEscapeAndAdvance();
1216             break;
1217         case 'v':
1218             result = '\v';
1219             PrintControlEscapeAndAdvance();
1220             break;
1221         // c ControlLetter
1222         case 'c': {
1223             ParseControlLetter(result);
1224             break;
1225         }
1226         case '0': {
1227             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1228             PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
1229             if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) {  // NOLINT(readability-magic-numbers)
1230                 Advance();
1231                 result = 0;
1232                 break;
1233             }
1234             [[fallthrough]];
1235         }
1236         case '1':
1237         case '2':
1238         case '3':
1239         case '4':
1240         case '5':
1241         case '6':
1242         case '7': {
1243             if (IsUtf16()) {
1244                 // With /u, decimal escape is not interpreted as octal character code.
1245                 ParseError("Invalid class escape");
1246                 return 0;
1247             }
1248             result = ParseOctalLiteral();
1249             break;
1250         }
1251         // ParseHexEscapeSequence
1252         // ParseRegExpUnicodeEscapeSequence
1253         case 'x': {
1254             Advance();
1255             if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
1256                 return result;
1257             }
1258             if (IsUtf16()) {
1259                 ParseError("Invalid class escape");
1260                 return -1;
1261             }
1262             result = 'x';
1263             break;
1264         }
1265         case 'u': {
1266             Advance();
1267             if (ParseUnicodeEscape(&result)) {
1268                 return result;
1269             }
1270             if (IsUtf16()) {
1271                 // With /u, invalid escapes are not treated as identity escapes.
1272                 ParseError("Invalid unicode escape");
1273                 return 0;
1274             }
1275             // If \u is not followed by a two-digit hexadecimal, treat it
1276             // as an identity escape.
1277             result = 'u';
1278             break;
1279         }
1280         // IdentityEscape[?U]
1281         case '$':
1282         case '(':
1283         case ')':
1284         case '*':
1285         case '+':
1286         case '.':
1287         case '/':
1288         case '?':
1289         case '[':
1290         case '\\':
1291         case ']':
1292         case '^':
1293         case '{':
1294         case '|':
1295         case '}':
1296             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1297             PrintF("IdentityEscape %c\n", c0_);
1298             result = c0_;
1299             Advance();
1300             break;
1301         default: {
1302             ParseCharacterEscapeDefault(result);
1303             break;
1304         }
1305     }
1306     return static_cast<int>(result);
1307 }
1308 
ParseCharacterEscapeDefault(uint32_t & result)1309 void RegExpParser::ParseCharacterEscapeDefault(uint32_t &result)
1310 {
1311     if (IsUtf16()) {
1312         ParseError("Invalid unicode escape");
1313         result = 0;
1314         return;
1315     }
1316     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1317     PrintF("SourceCharacter %c\n", c0_);
1318     result = c0_;
1319     if (result < CHAR_MAXS) {
1320         Advance();
1321     }
1322 }
1323 
ParseControlLetter(uint32_t & result)1324 void RegExpParser::ParseControlLetter(uint32_t &result)
1325 {
1326     Advance();
1327     if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1328         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1329         PrintF("ControlLetter %c\n", c0_);
1330         result = static_cast<uint32_t>(c0_) & 0x1f;  // NOLINT(readability-magic-numbers, hicpp-signed-bitwise)
1331         Advance();
1332     } else {
1333         if (!IsUtf16()) {
1334             pc_--;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1335             result = '\\';
1336         } else {
1337             ParseError("Invalid control letter");
1338             result = -1;
1339         }
1340     }
1341 }
1342 
// Traces the current character as a ControlEscape via PrintF() and then advances the parser by
// one character. Shared by the control-escape cases of ParseCharacterEscape().
void RegExpParser::PrintControlEscapeAndAdvance()
{
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
    PrintF("ControlEscape %c\n", c0_);
    Advance();
}
1349 
ParseClassRangesImpl(RangeSet * result)1350 bool RegExpParser::ParseClassRangesImpl(RangeSet *result)
1351 {
1352     RangeSet s1;
1353     uint32_t c1 = ParseClassAtom(&s1);
1354     if (c1 == UINT32_MAX) {
1355         ParseError("invalid class range");
1356         return false;
1357     }
1358 
1359     int nextC0 = *pc_;
1360     if (c0_ == '-' && nextC0 != ']') {
1361         if (c1 == CLASS_RANGE_BASE) {
1362             if (IsUtf16()) {
1363                 ParseError("invalid class range");
1364                 return false;
1365             }
1366             result->Insert(s1);
1367             return true;
1368         }
1369         Advance();
1370         RangeSet s2;
1371         uint32_t c2 = ParseClassAtom(&s2);
1372         if (c2 == UINT32_MAX) {
1373             ParseError("invalid class range");
1374             return false;
1375         }
1376         if (c2 == CLASS_RANGE_BASE) {
1377             if (IsUtf16()) {
1378                 ParseError("invalid class range");
1379                 return false;
1380             }
1381             result->Insert(s2);
1382             return true;
1383         }
1384         if (c1 < INT8_MAX) {
1385             if (c1 > c2) {
1386                 ParseError("invalid class range");
1387                 return false;
1388             }
1389         }
1390         if (IsIgnoreCase()) {
1391             c1 = static_cast<uint32_t>(Canonicalize(c1, IsUtf16()));
1392             c2 = static_cast<uint32_t>(Canonicalize(c2, IsUtf16()));
1393         }
1394 
1395         result->Insert(c1, c2);
1396     } else {
1397         result->Insert(s1);
1398     }
1399 
1400     return true;
1401 }
1402 
ParseClassRanges(RangeSet * result)1403 bool RegExpParser::ParseClassRanges(RangeSet *result)
1404 {
1405     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1406     PrintF("Parse ClassRanges------\n");
1407     while (c0_ != ']') {
1408         if (!ParseClassRangesImpl(result)) {
1409             return false;
1410         }
1411     }
1412     Advance();
1413     return true;
1414 }
1415 
ParseClassAtom(RangeSet * atom)1416 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1417 {
1418     uint32_t ret = UINT32_MAX;
1419     switch (c0_) {
1420         case '\\': {
1421             Advance();
1422             ret = static_cast<uint32_t>(ParseClassEscape(atom));
1423             break;
1424         }
1425         case KEY_EOF:
1426             break;
1427         case 0: {
1428             if (pc_ >= end_) {
1429                 return UINT32_MAX;
1430             }
1431             [[fallthrough]];
1432         }
1433         default: {
1434             uint32_t value = c0_;
1435             size_t u16Size;
1436             if (c0_ > INT8_MAX) {
1437                 pc_ -= 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1438                 auto u16Result = utf::ConvertUtf8ToUtf16Pair(pc_, true);
1439                 value = u16Result.first;
1440                 u16Size = u16Result.second;
1441                 Advance(u16Size + 1);
1442             } else {
1443                 Advance();
1444             }
1445             if (IsIgnoreCase()) {
1446                 value = static_cast<uint32_t>(Canonicalize(value, IsUtf16()));
1447             }
1448             atom->Insert(RangeSet(value));
1449             ret = value;
1450             break;
1451         }
1452     }
1453     return ret;
1454 }
1455 
InsertRangeBase(RangeSet * atom,RangeSet & rangeSet,bool invert)1456 void RegExpParser::InsertRangeBase(RangeSet *atom, RangeSet &rangeSet, bool invert)
1457 {
1458     atom->Insert(rangeSet);
1459     if (invert) {
1460         atom->Invert(IsUtf16());
1461     }
1462 }
1463 
ParseClassEscape(RangeSet * atom)1464 int RegExpParser::ParseClassEscape(RangeSet *atom)
1465 {
1466     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1467     PrintF("Parse ClassEscape------\n");
1468     int result = -1;
1469     switch (c0_) {
1470         case 'b':
1471             Advance();
1472             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1473             PrintF("ClassEscape %c", 'b');
1474             result = '\b';
1475             atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1476             break;
1477         case '-':
1478             Advance();
1479             result = '-';
1480             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1481             PrintF("ClassEscape %c", '-');
1482             atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1483             break;
1484         // CharacterClassEscape
1485         case 'd':
1486         case 'D':
1487             result = CLASS_RANGE_BASE;
1488             InsertRangeBase(atom, g_gRangeD, c0_ == 'D');
1489             Advance();
1490             break;
1491         case 's':
1492         case 'S':
1493             result = CLASS_RANGE_BASE;
1494             InsertRangeBase(atom, g_gRangeS, c0_ == 'S');
1495             Advance();
1496             break;
1497         case 'w':
1498         case 'W':
1499             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1500             PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1501             result = CLASS_RANGE_BASE;
1502             InsertRangeBase(atom, g_gRangeW, c0_ == 'W');
1503             Advance();
1504             break;
1505         // P{UnicodePropertyValueExpression}
1506         // p{UnicodePropertyValueExpression}
1507         case 'P':
1508         case 'p':
1509             ParseUnicodePropertyValueCharacters(result);
1510             break;
1511         default:
1512             result = ParseCharacterEscape();
1513             int value = result;
1514             if (IsIgnoreCase()) {
1515                 value = Canonicalize(value, IsUtf16());
1516             }
1517             atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1518             break;
1519     }
1520     return result;
1521 }
1522 
ParseUnicodePropertyValueCharacters(int & result)1523 void RegExpParser::ParseUnicodePropertyValueCharacters(int &result)
1524 {
1525     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1526     PrintF("Warning: \\p is not supported in ECMA 2015!");
1527     Advance();
1528     if (c0_ == '{') {
1529         Advance();
1530         if (c0_ == '}') {
1531             return;  // p{}, invalid
1532         }
1533         bool isValue = false;
1534         ParseUnicodePropertyValueCharactersImpl(&isValue);
1535         if (!isValue && c0_ == '=') {
1536             // UnicodePropertyName = UnicodePropertyValue
1537             Advance();
1538             if (c0_ == '}') {
1539                 return;  // p{xxx=}, invalid
1540             }
1541             ParseUnicodePropertyValueCharactersImpl(&isValue);
1542         }
1543         if (c0_ != '}') {
1544             return;  // p{xxx, invalid
1545         }
1546         // should do atom->Invert() here after ECMA 9.0
1547         Advance();
1548         result = CLASS_RANGE_BASE;
1549     }
1550 }
1551 
ParseUnicodePropertyValueCharactersImpl(bool * isValue)1552 void RegExpParser::ParseUnicodePropertyValueCharactersImpl(bool *isValue)
1553 {
1554     if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1555         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1556         PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1557     } else if (c0_ == '_') {
1558         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1559         PrintF("UnicodePropertyCharacter:: _ \n");
1560     } else if (c0_ >= '0' && c0_ <= '9') {
1561         *isValue = true;
1562         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1563         PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1564     } else {
1565         return;
1566     }
1567     Advance();
1568     ParseUnicodePropertyValueCharactersImpl(isValue);
1569 }
1570 
1571 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1572 void RegExpParser::PrintF(const char *fmt, ...)
1573 {
1574     (void)fmt;
1575 }
1576 
ParseError(const char * errorMessage)1577 void RegExpParser::ParseError(const char *errorMessage)
1578 {
1579     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1580     PrintF("error: ");
1581     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1582     PrintF(errorMessage);
1583     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1584     PrintF("\n");
1585     SetIsError();
1586     size_t length = strlen(errorMessage) + 1;
1587     if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1588         LOG(FATAL, COMMON) << "memcpy_s failed";
1589         UNREACHABLE();
1590     }
1591 }
1592 
IsIdentFirst(uint32_t c)1593 int RegExpParser::IsIdentFirst(uint32_t c)
1594 {
1595     if (c < CACHE_SIZE) {
1596         // NOLINTNEXTLINE(hicpp-signed-bitwise
1597         return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1;  // 5: Shift five bits 31: and operation binary of 31
1598     }
1599     return static_cast<int>(u_isIDStart(c));
1600 }
1601 }  // namespace ark