• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "runtime/regexp/ecmascript/regexp_parser.h"
17 #include "runtime/regexp/ecmascript/regexp_opcode.h"
18 #include "runtime/include/coretypes/string-inl.h"
19 
20 #include "libpandabase/utils/utils.h"
21 #include "libpandabase/utils/utf.h"
22 
23 #include "securec.h"
24 #include "unicode/uchar.h"
25 #include "unicode/uniset.h"
26 
27 namespace {
28 constexpr int INVALID_UNICODE_FROM_UTF8 = -1;
29 
30 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
31 constexpr int UICODE_FROM_UTF8[] = {
32     0x80, 0xc0, 0xdf, 0xe0, 0xef, 0xf0, 0xf7, 0xf8, 0xfb, 0xfc, 0xfd,
33 };
34 
35 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
36 constexpr int UTF8_MIN_CODE[] = {
37     0x80, 0x800, 0x10000, 0x00200000, 0x04000000,
38 };
39 
40 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
41 constexpr char UTF8_FIRST_CODE[] = {
42     0x1f, 0xf, 0x7, 0x3, 0x1,
43 };
44 
FromUtf8(int c,int l,const uint8_t * p,const uint8_t ** pp)45 int FromUtf8(int c, int l, const uint8_t *p, const uint8_t **pp)
46 {
47     uint32_t b;
48     // NOLINTNEXTLINE(hicpp-signed-bitwise)
49     c &= UTF8_FIRST_CODE[l - 1];
50     for (int i = 0; i < l; i++) {
51         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
52         b = *p++;
53         if (b < ark::utf::UTF8_2B_SECOND || b >= ark::utf::UTF8_2B_FIRST) {
54             return INVALID_UNICODE_FROM_UTF8;
55         }
56         // NOLINTNEXTLINE(hicpp-signed-bitwise)
57         c = (c << 6) | (b & ark::utf::UTF8_2B_THIRD);  // 6: Maximum Unicode range
58     }
59     if (c < UTF8_MIN_CODE[l - 1]) {
60         return INVALID_UNICODE_FROM_UTF8;
61     }
62     *pp = p;
63     return c;
64 }
65 
UnicodeFromUtf8(const uint8_t * p,int maxLen,const uint8_t ** pp)66 int UnicodeFromUtf8(const uint8_t *p, int maxLen, const uint8_t **pp)
67 {
68     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
69     int c = *p++;
70     if (c < UICODE_FROM_UTF8[0]) {
71         *pp = p;
72         return c;
73     }
74     int l = 0;
75     if (c >= UICODE_FROM_UTF8[1U] && c <= UICODE_FROM_UTF8[2U]) {         // 1 - 2: 0000 0080 - 0000 07FF
76         l = 1;                                                            // 1: 0000 0080 - 0000 07FF Unicode
77     } else if (c >= UICODE_FROM_UTF8[3U] && c <= UICODE_FROM_UTF8[4U]) {  // 3 - 4: 0000 0800 - 0000 FFFF
78         l = 2;                                                            // 2: 0000 0800 - 0000 FFFF Unicode
79     } else if (c >= UICODE_FROM_UTF8[5U] && c <= UICODE_FROM_UTF8[6U]) {  // 5 - 6: 0001 0000 - 0010 FFFF
80         l = 3;                                                            // 3: 0001 0000 - 0010 FFFF Unicode
81     } else if (c >= UICODE_FROM_UTF8[7U] && c <= UICODE_FROM_UTF8[8U]) {  // 7 - 8: 0020 0000 - 03FF FFFF
82         l = 4;                                                            // 4: 0020 0000 - 03FF FFFF Unicode
83         // NOLINTNEXTLINE(readability-magic-numbers)
84     } else if (c == UICODE_FROM_UTF8[9U] || c == UICODE_FROM_UTF8[10U]) {  // 9 - 10: 0400 0000 - 7FFF FFFF
85         l = 5;                                                             // 5: 0400 0000 - 7FFF FFFF Unicode
86     } else {
87         return INVALID_UNICODE_FROM_UTF8;
88     }
89     /* check that we have enough characters */
90     if (l > (maxLen - 1)) {
91         return INVALID_UNICODE_FROM_UTF8;
92     }
93     return FromUtf8(c, l, p, pp);
94 }
95 }  // namespace
96 
97 namespace ark {
98 static constexpr uint32_t CACHE_SIZE = 128;
99 static constexpr uint32_t CHAR_MAXS = 128;
100 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
101 static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
102     /* $ A-Z _ a-z */
103     0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE};
104 static RangeSet g_gRangeD(0x30, 0x39);  // NOLINT(fuchsia-statically-constructed-objects, readability-magic-numbers)
105 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
106 static RangeSet g_gRangeS({
107     std::pair<uint32_t, uint32_t>(0x0009, 0x000D),  // NOLINT(readability-magic-numbers)
108     std::pair<uint32_t, uint32_t>(0x0020, 0x0020),  // NOLINT(readability-magic-numbers)
109     std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0),  // NOLINT(readability-magic-numbers)
110     std::pair<uint32_t, uint32_t>(0x1680, 0x1680),  // NOLINT(readability-magic-numbers)
111     std::pair<uint32_t, uint32_t>(0x2000, 0x200A),  // NOLINT(readability-magic-numbers)
112     /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
113     /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
114     std::pair<uint32_t, uint32_t>(0x2028, 0x2029),  // NOLINT(readability-magic-numbers)
115     std::pair<uint32_t, uint32_t>(0x202F, 0x202F),  // NOLINT(readability-magic-numbers)
116     std::pair<uint32_t, uint32_t>(0x205F, 0x205F),  // NOLINT(readability-magic-numbers)
117     std::pair<uint32_t, uint32_t>(0x3000, 0x3000),  // NOLINT(readability-magic-numbers)
118     /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
119     std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF),  // NOLINT(readability-magic-numbers)
120 });
121 
122 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
123 static RangeSet g_gRangeW({
124     std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINT(readability-magic-numbers)
125     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
126     std::pair<uint32_t, uint32_t>(0x005F, 0x005F),  // NOLINT(readability-magic-numbers)
127     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
128 });
129 
130 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
131 static RangeSet g_gRegexpIdentifyStart({
132     std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINT(readability-magic-numbers)
133     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
134     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
135 });
136 
137 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
138 static RangeSet g_gRegexpIdentifyContinue({
139     std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINT(readability-magic-numbers)
140     std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINT(readability-magic-numbers)
141     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
142     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
143 });
144 
Parse()145 void RegExpParser::Parse()
146 {
147     // dynbuffer head init [size,capture_count,statck_count,flags]
148     buffer_.EmitU32(0);
149     buffer_.EmitU32(0);
150     buffer_.EmitU32(0);
151     buffer_.EmitU32(0);
152     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
153     PrintF("Parse Pattern------\n");
154     // Pattern[U, N]::
155     //      Disjunction[?U, ?N]
156     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
157     Advance();
158     SaveStartOpCode saveStartOp;
159     int captureIndex = captureCount_++;
160     saveStartOp.EmitOpCode(&buffer_, captureIndex);
161     ParseDisjunction(false);
162     if (c0_ != KEY_EOF) {
163         ParseError("extraneous characters at the end");
164         return;
165     }
166     SaveEndOpCode saveEndOp;
167     saveEndOp.EmitOpCode(&buffer_, captureIndex);
168     MatchEndOpCode matchEndOp;
169     matchEndOp.EmitOpCode(&buffer_, 0);
170     // dynbuffer head assignments
171     buffer_.PutU32(0, buffer_.size_);
172     buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
173     buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
174     buffer_.PutU32(FLAGS_OFFSET, flags_);
175 }
176 
ParseDisjunction(bool isBackward)177 void RegExpParser::ParseDisjunction(bool isBackward)
178 {
179     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
180     PrintF("Parse Disjunction------\n");
181     size_t start = buffer_.size_;
182     ParseAlternative(isBackward);
183     if (isError_) {
184         return;
185     }
186     do {
187         if (c0_ == '|') {
188             SplitNextOpCode splitOp;
189             uint32_t len = buffer_.size_ - start;
190             GotoOpCode gotoOp;
191             splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
192             uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
193             Advance();
194             ParseAlternative(isBackward);
195             gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
196         }
197     } while (c0_ != KEY_EOF && c0_ != ')');
198 }
199 
ParseOctalLiteral()200 uint32_t RegExpParser::ParseOctalLiteral()
201 {
202     // For compatibility with some other browsers (not all), we parse
203     // up to three octal digits with a value below 256.
204     // ES#prod-annexB-LegacyOctalEscapeSequence
205     uint32_t value = c0_ - '0';
206     Advance();
207     if (c0_ >= '0' && c0_ <= '7') {
208         value = value * OCTAL_VALUE + c0_ - '0';
209         Advance();
210         if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
211             value = value * OCTAL_VALUE + c0_ - '0';
212             Advance();
213         }
214     }
215     return value;
216 }
217 
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)218 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
219 {
220     uint32_t x = 0;
221     int d = static_cast<int>(HexValue(c0_));
222     if (d < 0) {
223         return false;
224     }
225     while (d >= 0) {
226         if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
227             LOG(FATAL, COMMON) << "value overflow";
228             return false;
229         }
230         x = x * HEX_VALUE + static_cast<uint32_t>(d);
231         if (x > maxValue) {
232             return false;
233         }
234         Advance();
235         d = static_cast<int>(HexValue(c0_));
236     }
237     *value = x;
238     return true;
239 }
240 
241 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uint32_t * value)242 bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
243 {
244     // Accept both \uxxxx and \u{xxxxxx} (if allowed).
245     // In the latter case, the number of hex digits between { } is arbitrary.
246     // \ and u have already been read.
247     if (c0_ == '{' && IsUtf16()) {
248         uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
249         Advance();
250         if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {  // NOLINT(readability-magic-numbers)
251             if (c0_ == '}') {
252                 Advance();
253                 return true;
254             }
255         }
256         pc_ = start;
257         Advance();
258         return false;
259     }
260     // \u but no {, or \u{...} escapes not allowed.
261     bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
262     if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
263         // Attempt to read trail surrogate.
264         uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
265         if (*pc_ == 'u') {
266             Advance(UNICODE_HEX_ADVANCE);
267             uint32_t trail;
268             if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
269                 *value = U16_GET_SUPPLEMENTARY((*value), (trail));  // NOLINT(hicpp-signed-bitwise)
270                 return true;
271             }
272         }
273         pc_ = start;
274         Advance();
275     }
276     return result;
277 }
278 
ParseHexEscape(int length,uint32_t * value)279 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
280 {
281     uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
282     uint32_t val = 0;
283     for (int i = 0; i < length; ++i) {
284         uint32_t c = c0_;
285         int d = static_cast<int>(HexValue(c));
286         if (d < 0) {
287             pc_ = start;
288             Advance();
289             return false;
290         }
291         val = val * HEX_VALUE + static_cast<uint32_t>(d);
292         Advance();
293     }
294     *value = val;
295     return true;
296 }
297 
ParseAlternativeEscape(bool isBackward,bool & isAtom)298 void RegExpParser::ParseAlternativeEscape(bool isBackward, bool &isAtom)
299 {
300     switch (c0_) {
301         case 'b': {
302             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
303             PrintF("Assertion %c \n", c0_);
304             WordBoundaryOpCode wordBoundaryOp;
305             wordBoundaryOp.EmitOpCode(&buffer_, 0);
306             Advance();
307             break;
308         }
309         case 'B': {
310             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
311             PrintF("Assertion %c \n", c0_);
312             NotWordBoundaryOpCode notWordBoundaryOp;
313             notWordBoundaryOp.EmitOpCode(&buffer_, 0);
314             Advance();
315             break;
316         }
317         default: {
318             isAtom = true;
319             int atomValue = ParseAtomEscape(isBackward);
320             if (atomValue != -1) {
321                 ParseAlternativeEscapeDefault(atomValue);
322             }
323             break;
324         }
325     }
326 }
327 
ParseAlternativeEscapeDefault(int atomValue)328 void RegExpParser::ParseAlternativeEscapeDefault(int atomValue)
329 {
330     if (IsIgnoreCase()) {
331         if (!IsUtf16()) {
332             atomValue = Canonicalize(atomValue, false);
333         } else {
334             icu::UnicodeSet set(atomValue, atomValue);
335             set.closeOver(USET_CASE_INSENSITIVE);
336             set.removeAllStrings();
337             int32_t size = set.size();
338             RangeOpCode rangeOp;
339             RangeSet rangeResult;
340             for (int32_t idx = 0; idx < size; idx++) {
341                 int32_t uc = set.charAt(idx);
342                 RangeSet curRange(uc);
343                 rangeResult.Insert(curRange);
344             }
345             rangeOp.InsertOpCode(&buffer_, rangeResult);
346             return;
347         }
348     }
349     if (atomValue <= UINT16_MAX) {
350         CharOpCode charOp;
351         charOp.EmitOpCode(&buffer_, atomValue);
352     } else {
353         Char32OpCode charOp;
354         charOp.EmitOpCode(&buffer_, atomValue);
355     }
356 }
357 
ParsePatternCharacter(bool isBackward)358 void RegExpParser::ParsePatternCharacter(bool isBackward)
359 {
360     PrevOpCode prevOp;
361     if (isBackward) {
362         prevOp.EmitOpCode(&buffer_, 0);
363     }
364     uint32_t matchedChar = c0_;
365     if (c0_ > (INT8_MAX + 1)) {
366         Prev();
367         int i = 0;
368         UChar32 c;
369         int32_t length = end_ - pc_ + 1;
370         // NOLINTNEXTLINE(hicpp-signed-bitwise)
371         U8_NEXT(pc_, i, length, c);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
372         matchedChar = static_cast<uint32_t>(c);
373         pc_ += i;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
374     }
375     if (IsIgnoreCase()) {
376         matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
377     }
378     if (matchedChar > UINT16_MAX) {
379         Char32OpCode charOp;
380         charOp.EmitOpCode(&buffer_, matchedChar);
381     } else {
382         CharOpCode charOp;
383         charOp.EmitOpCode(&buffer_, matchedChar);
384     }
385     if (isBackward) {
386         prevOp.EmitOpCode(&buffer_, 0);
387     }
388 }
389 
ParseAlternativeAny(bool isBackward)390 void RegExpParser::ParseAlternativeAny(bool isBackward)
391 {
392     PrevOpCode prevOp;
393     if (isBackward) {
394         prevOp.EmitOpCode(&buffer_, 0);
395     }
396     if (IsDotAll()) {
397         AllOpCode allOp;
398         allOp.EmitOpCode(&buffer_, 0);
399     } else {
400         DotsOpCode dotsOp;
401         dotsOp.EmitOpCode(&buffer_, 0);
402     }
403     if (isBackward) {
404         prevOp.EmitOpCode(&buffer_, 0);
405     }
406 }
407 
ParseAlternativeRange(bool isBackward)408 void RegExpParser::ParseAlternativeRange(bool isBackward)
409 {
410     PrevOpCode prevOp;
411     Advance();
412     if (isBackward) {
413         prevOp.EmitOpCode(&buffer_, 0);
414     }
415     bool isInvert = false;
416     if (c0_ == '^') {
417         isInvert = true;
418         Advance();
419     }
420     RangeSet rangeResult;
421     if (!ParseClassRanges(&rangeResult)) {
422         return;
423     }
424     if (isInvert) {
425         rangeResult.Invert(IsUtf16());
426     }
427     uint32_t highValue = rangeResult.HighestValue();
428     if (highValue <= UINT16_MAX) {
429         RangeOpCode rangeOp;
430         rangeOp.InsertOpCode(&buffer_, rangeResult);
431     } else {
432         Range32OpCode rangeOp;
433         rangeOp.InsertOpCode(&buffer_, rangeResult);
434     }
435 
436     if (isBackward) {
437         prevOp.EmitOpCode(&buffer_, 0);
438     }
439 }
440 
441 // CC-OFFNXT(G.FUN.01, huge_method) solid logic
ParseAlternativeImpl(bool isBackward,bool & isAtom,int & captureIndex)442 void RegExpParser::ParseAlternativeImpl(bool isBackward, bool &isAtom, int &captureIndex)
443 {
444     switch (c0_) {
445         case '^': {
446             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
447             PrintF("Assertion %c line start \n", c0_);
448             LineStartOpCode lineStartOp;
449             lineStartOp.EmitOpCode(&buffer_, 0);
450             Advance();
451             break;
452         }
453         case '$': {
454             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
455             PrintF("Assertion %c line end \n", c0_);
456             LineEndOpCode lineEndOp;
457             lineEndOp.EmitOpCode(&buffer_, 0);
458             Advance();
459             break;
460         }
461         case '\\': {
462             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
463             PrintF("Escape %c \n", c0_);
464             Advance();
465             ParseAlternativeEscape(isBackward, isAtom);
466             break;
467         }
468         case '(': {
469             Advance();
470             isAtom = ParseAssertionCapture(&captureIndex, isBackward);
471             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
472             Advance();
473             break;
474         }
475         case '.': {
476             ParseAlternativeAny(isBackward);
477             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
478             PrintF("Atom %c match any \n", c0_);
479             isAtom = true;
480             Advance();
481             break;
482         }
483         case '[': {
484             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
485             PrintF("Atom %c match range \n", c0_);
486             isAtom = true;
487             ParseAlternativeRange(isBackward);
488             break;
489         }
490         case '*':
491         case '+':
492         case '?':
493             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
494             ParseError("nothing to repeat");
495             return;
496         case '{': {
497             uint8_t *begin = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
498             int dummy;
499             if (ParserIntervalQuantifier(&dummy, &dummy)) {
500                 ParseError("nothing to repeat");
501                 return;
502             }
503             pc_ = begin;
504             Advance();
505         }
506             [[fallthrough]];
507         case '}':
508         case ']':
509             if (IsUtf16()) {
510                 ParseError("syntax error");
511                 return;
512             }
513             [[fallthrough]];
514         default: {
515             // PatternCharacter
516             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
517             PrintF("PatternCharacter %c\n", c0_);
518             isAtom = true;
519             ParsePatternCharacter(isBackward);
520             Advance();
521             break;
522         }
523     }
524 }
525 
ParseAlternative(bool isBackward)526 void RegExpParser::ParseAlternative(bool isBackward)
527 {
528     size_t start = buffer_.size_;
529     while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
530         if (isError_) {
531             return;
532         }
533         size_t atomBcStart = buffer_.GetSize();
534         int captureIndex = 0;
535         bool isAtom = false;
536         ParseAlternativeImpl(isBackward, isAtom, captureIndex);
537         if (isAtom && !isError_) {
538             ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
539         }
540         if (isBackward) {
541             size_t end = buffer_.GetSize();
542             size_t termSize = end - atomBcStart;
543             size_t moveSize = end - start;
544             buffer_.Expand(end + termSize);
545             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
546             if (memmove_s(buffer_.buf_ + start + termSize, moveSize, buffer_.buf_ + start, moveSize) != EOK) {
547                 LOG(FATAL, COMMON) << "memmove_s failed";
548                 UNREACHABLE();
549             }
550             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
551             if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
552                 LOG(FATAL, COMMON) << "memcpy_s failed";
553                 UNREACHABLE();
554             }
555         }
556     }
557 }
558 
FindGroupName(const PandaString & name)559 int RegExpParser::FindGroupName(const PandaString &name)
560 {
561     size_t len;
562     size_t nameLen = name.size();
563     const char *p = reinterpret_cast<char *>(groupNames_.buf_);
564     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
565     const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
566     int captureIndex = 1;
567     while (p < bufEnd) {
568         len = strlen(p);
569         if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
570             return captureIndex;
571         }
572         p += len + 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
573         captureIndex++;
574     }
575     return -1;
576 }
577 
578 template <typename OpCodeT>
InsertMatchAheadOpCode(bool isBackward)579 void RegExpParser::InsertMatchAheadOpCode(bool isBackward)
580 {
581     Advance();
582     uint32_t start = buffer_.size_;
583     ParseDisjunction(isBackward);
584     MatchOpCode matchOp;
585     matchOp.EmitOpCode(&buffer_, 0);
586     OpCodeT matchAheadOp;
587     uint32_t len = buffer_.size_ - start;
588     matchAheadOp.InsertOpCode(&buffer_, start, len);
589 }
590 
HandleGroupName()591 bool RegExpParser::HandleGroupName()
592 {
593     PandaString name;
594     auto **pp = const_cast<const uint8_t **>(&pc_);
595     if (!ParseGroupSpecifier(pp, name)) {
596         ParseError("GroupName Syntax error.");
597         return false;
598     }
599     if (FindGroupName(name) > 0) {
600         ParseError("Duplicate GroupName error.");
601         return false;
602     }
603     groupNames_.EmitStr(name.c_str());
604     newGroupNames_.push_back(name);
605     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
606     PrintF("group name %s", name.c_str());
607 
608     return true;
609 }
610 
ParseAssertion(bool isBackward,bool & isAtom,bool & parseCapture)611 bool RegExpParser::ParseAssertion(bool isBackward, bool &isAtom, bool &parseCapture)
612 {
613     switch (c0_) {
614         // (?=Disjunction[?U, ?N])
615         case '=': {
616             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
617             PrintF("Assertion(?= Disjunction)\n");
618             InsertMatchAheadOpCode<MatchAheadOpCode>(isBackward);
619             break;
620         }
621         // (?!Disjunction[?U, ?N])
622         case '!': {
623             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
624             PrintF("Assertion(?! Disjunction)\n");
625             InsertMatchAheadOpCode<NegativeMatchAheadOpCode>(isBackward);
626             break;
627         }
628         case '<': {
629             Advance();
630             // (?<=Disjunction[?U, ?N])
631             if (c0_ == '=') {
632                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
633                 PrintF("Assertion(?<= Disjunction)\n");
634                 InsertMatchAheadOpCode<MatchAheadOpCode>(true);
635                 return true;
636                 // (?<!Disjunction[?U, ?N])
637             }
638             if (c0_ == '!') {
639                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
640                 PrintF("Assertion(?<! Disjunction)\n");
641                 InsertMatchAheadOpCode<NegativeMatchAheadOpCode>(true);
642                 return true;
643             }
644 
645             Prev();
646             if (!HandleGroupName()) {
647                 return false;
648             }
649             Advance();
650             parseCapture = true;
651             break;
652         }
653         // (?:Disjunction[?U, ?N])
654         case ':':
655             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
656             PrintF("Atom(?<: Disjunction)\n");
657             isAtom = true;
658             Advance();
659             ParseDisjunction(isBackward);
660             break;
661         default:
662             Advance();
663             ParseError("? Syntax error.");
664             return false;
665     }
666 
667     return true;
668 }
669 
ParseAssertionCapture(int * captureIndex,bool isBackward)670 bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
671 {
672     bool isAtom = false;
673     do {
674         bool parseCapture = false;
675         if (c0_ == '?') {
676             Advance();
677             if (!ParseAssertion(isBackward, isAtom, parseCapture)) {
678                 return false;
679             }
680         } else {
681             groupNames_.EmitChar(0);
682             parseCapture = true;
683         }
684         if (parseCapture) {
685             isAtom = true;
686             *captureIndex = captureCount_++;
687             SaveEndOpCode saveEndOp;
688             SaveStartOpCode saveStartOp;
689             if (isBackward) {
690                 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
691             } else {
692                 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
693             }
694             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
695             PrintF("capture start %d \n", *captureIndex);
696             ParseDisjunction(isBackward);
697             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
698             PrintF("capture end %d \n", *captureIndex);
699             if (isBackward) {
700                 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
701             } else {
702                 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
703             }
704         }
705     } while (c0_ != ')' && c0_ != KEY_EOF);
706     if (c0_ != ')') {
707         ParseError("capture syntax error");
708         return false;
709     }
710     return isAtom;
711 }
712 
ParseDecimalDigits()713 int RegExpParser::ParseDecimalDigits()
714 {
715     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
716     PrintF("Parse DecimalDigits------\n");
717     uint32_t result = 0;
718     bool overflow = false;
719     while (true) {
720         if (c0_ < '0' || c0_ > '9') {
721             break;
722         }
723         if (!overflow) {
724             if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
725                 overflow = true;
726             } else {
727                 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
728             }
729         }
730         Advance();
731     }
732     if (overflow) {
733         return INT32_MAX;
734     }
735     return result;
736 }
737 
ParserIntervalQuantifier(int * pmin,int * pmax)738 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
739 {
740     // Quantifier::
741     //     QuantifierPrefix
742     //     QuantifierPrefix?
743     // QuantifierPrefix::
744     // *
745     // +
746     // ?
747     // {DecimalDigits}
748     // {DecimalDigits,}
749     // {DecimalDigits,DecimalDigits}
750     Advance();
751     *pmin = ParseDecimalDigits();
752     *pmax = *pmin;
753     switch (c0_) {
754         case ',': {
755             Advance();
756             if (c0_ == '}') {
757                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
758                 PrintF("QuantifierPrefix{DecimalDigits,}\n");
759                 *pmax = INT32_MAX;
760                 Advance();
761             } else {
762                 *pmax = ParseDecimalDigits();
763                 if (c0_ == '}') {
764                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
765                     PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
766                     Advance();
767                 } else {
768                     return false;
769                 }
770             }
771             break;
772         }
773         case '}':
774             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
775             PrintF("QuantifierPrefix{DecimalDigits}\n");
776             Advance();
777             break;
778         default:
779             Advance();
780             return false;
781     }
782     return true;
783 }
784 
ParseQuantifierPrefix(int & min,int & max,bool & isGreedy)785 bool RegExpParser::ParseQuantifierPrefix(int &min, int &max, bool &isGreedy)
786 {
787     switch (c0_) {
788         case '*':
789             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
790             PrintF("QuantifierPrefix %c\n", c0_);
791             min = 0;
792             max = INT32_MAX;
793             Advance();
794             break;
795         case '+':
796             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
797             PrintF("QuantifierPrefix %c\n", c0_);
798             min = 1;
799             max = INT32_MAX;
800             Advance();
801             break;
802         case '?':
803             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
804             PrintF("QuantifierPrefix %c\n", c0_);
805             Advance();
806             min = 0;
807             max = 1;
808             break;
809         case '{': {
810             uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
811             if (!ParserIntervalQuantifier(&min, &max)) {
812                 pc_ = start;
813                 Advance();  // back to '{'
814                 return false;
815             }
816             if (min > max) {
817                 ParseError("Invalid repetition count");
818                 return false;
819             }
820             break;
821         }
822         default:
823             break;
824     }
825     if (c0_ == '?') {
826         isGreedy = false;
827         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
828         PrintF("Quantifier::QuantifierPrefix?\n");
829         Advance();
830     } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
831         ParseError("nothing to repeat");
832         return false;
833     }
834     return true;
835 }
836 
ParseQuantifier(size_t atomBcStart,int captureStart,int captureEnd)837 void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
838 {
839     int min = -1;
840     int max = -1;
841     bool isGreedy = true;
842     if (!ParseQuantifierPrefix(min, max, isGreedy)) {
843         return;
844     }
845     if (min != -1 && max != -1) {
846         stackCount_++;
847         PushOpCode pushOp;
848         pushOp.InsertOpCode(&buffer_, atomBcStart);
849         atomBcStart += pushOp.GetSize();
850 
851         if (captureStart != 0) {
852             SaveResetOpCode saveResetOp;
853             saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
854         }
855 
856         // zero advance check
857         if (max == INT32_MAX) {
858             stackCount_++;
859             PushCharOpCode pushCharOp;
860             pushCharOp.InsertOpCode(&buffer_, atomBcStart);
861             CheckCharOpCode checkCharOp;
862             // NOLINTNEXTLINE(readability-magic-numbers)
863             checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
864         }
865 
866         if (isGreedy) {
867             LoopGreedyOpCode loopOp;
868             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
869         } else {
870             LoopOpCode loopOp;
871             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
872         }
873 
874         if (min == 0) {
875             if (isGreedy) {
876                 SplitNextOpCode splitNextOp;
877                 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
878             } else {
879                 SplitFirstOpCode splitFirstOp;
880                 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
881             }
882         }
883 
884         PopOpCode popOp;
885         popOp.EmitOpCode(&buffer_);
886     }
887 }
888 
ParseGroupSpecifier(const uint8_t ** pp,PandaString & name)889 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, PandaString &name)
890 {
891     const uint8_t *p = *pp;
892     uint32_t c;
893     std::array<char, CACHE_SIZE> buffer {};
894     char *q = buffer.data();
895     while (true) {
896         if (p <= end_) {
897             c = *p;
898         } else {
899             c = KEY_EOF;
900         }
901         if (c == '\\') {
902             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
903             p++;
904             if (*p != 'u') {
905                 return false;
906             }
907             if (!ParseUnicodeEscape(&c)) {
908                 return false;
909             }
910         } else if (c == '>') {
911             break;
912         } else if (c > CACHE_SIZE && c != KEY_EOF) {
913             c = static_cast<uint32_t>(UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
914         } else if (c != KEY_EOF) {
915             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
916             p++;
917         } else {
918             return false;
919         }
920         if (q == buffer.data()) {
921             if (IsIdentFirst(c) != 0) {
922                 return false;
923             }
924         } else {
925             if (!u_isIDPart(c)) {
926                 return false;
927             }
928         }
929         if (q != nullptr) {
930             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
931             *q++ = c;
932         }
933     }
934     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
935     p++;
936     *pp = p;
937     name = buffer.data();
938     return true;
939 }
940 
CalculateCaptureIndex(const uint8_t * p,int & captureIndex,const char * groupName,PandaString & name)941 bool RegExpParser::CalculateCaptureIndex(const uint8_t *p, int &captureIndex, const char *groupName, PandaString &name)
942 {
943     if (p[1] == '?') {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
944         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
945         if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
946             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
947             p[CAPTURE_CONUT_ADVANCE] != '=') {
948             hasNamedCaptures_ = 1;
949             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
950             p += CAPTURE_CONUT_ADVANCE;
951             if (groupName != nullptr && ParseGroupSpecifier(&p, name) && strcmp(name.c_str(), groupName) == 0) {
952                 return true;
953             }
954             captureIndex++;
955         }
956     } else {
957         captureIndex++;
958     }
959 
960     return false;
961 }
962 
/**
 * Walks p forward until it reaches ']' or end, stepping over the character that follows a backslash.
 *
 * p is received by value and nothing is returned, so the position reached here is not visible to the caller.
 */
static inline void ShiftPointerToClosingBracket(const uint8_t *p, const uint8_t *end)
{
    for (; p < end && *p != ']'; p++) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        if (*p == '\\') {
            p++;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        }
    }
}
972 
ParseCaptureCountImpl(const uint8_t * p,int & captureIndex,const char * groupName,PandaString & name)973 bool RegExpParser::ParseCaptureCountImpl(const uint8_t *p, int &captureIndex, const char *groupName, PandaString &name)
974 {
975     switch (*p) {
976         case '(': {
977             if (CalculateCaptureIndex(p, captureIndex, groupName, name)) {
978                 return true;
979             }
980             break;
981         }
982         case '\\':
983             p++;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
984             break;
985         case '[': {
986             ShiftPointerToClosingBracket(p, end_);
987             break;
988         }
989         default:
990             break;
991     }
992 
993     return false;
994 }
995 
ParseCaptureCount(const char * groupName)996 int RegExpParser::ParseCaptureCount(const char *groupName)
997 {
998     const uint8_t *p = nullptr;
999     int captureIndex = 1;
1000     PandaString name;
1001     hasNamedCaptures_ = 0;
1002     for (p = base_; p < end_; p++) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1003         if (ParseCaptureCountImpl(p, captureIndex, groupName, name)) {
1004             return captureIndex;
1005         }
1006     }
1007     return captureIndex;
1008 }
1009 
ParseLookBehind(DynChunk & buffer,PrevOpCode & prevOp,bool isBackward)1010 void RegExpParser::ParseLookBehind(DynChunk &buffer, PrevOpCode &prevOp, bool isBackward)
1011 {
1012     if (isBackward) {
1013         prevOp.EmitOpCode(&buffer, 0);
1014     }
1015     Advance();
1016 }
1017 
InsertRangeOpCode(DynChunk & buffer,RangeSet & rangeSet,PrevOpCode & prevOp,bool isBackward)1018 void RegExpParser::InsertRangeOpCode(DynChunk &buffer, RangeSet &rangeSet, PrevOpCode &prevOp, bool isBackward)
1019 {
1020     RangeOpCode rangeOp;
1021     if (isBackward) {
1022         prevOp.EmitOpCode(&buffer, 0);
1023     }
1024     rangeOp.InsertOpCode(&buffer, rangeSet);
1025     ParseLookBehind(buffer, prevOp, isBackward);
1026 }
1027 
InsertRange32OpCode(DynChunk & buffer,RangeSet & rangeSet,PrevOpCode & prevOp,bool isBackward)1028 void RegExpParser::InsertRange32OpCode(DynChunk &buffer, RangeSet &rangeSet, PrevOpCode &prevOp, bool isBackward)
1029 {
1030     RangeSet atomRange(rangeSet);
1031     atomRange.Invert(IsUtf16());
1032     Range32OpCode rangeOp;
1033     if (isBackward) {
1034         prevOp.EmitOpCode(&buffer, 0);
1035     }
1036     rangeOp.InsertOpCode(&buffer, atomRange);
1037     ParseLookBehind(buffer, prevOp, isBackward);
1038 }
1039 
ParseGroupName()1040 int RegExpParser::ParseGroupName()
1041 {
1042     Advance();
1043     if (c0_ != '<') {
1044         if (!IsUtf16() || HasNamedCaptures()) {
1045             ParseError("expecting group name.");
1046             return -1;
1047         }
1048     }
1049     Advance();
1050     Prev();
1051     PandaString name;
1052     auto **pp = const_cast<const uint8_t **>(&pc_);
1053     if (!ParseGroupSpecifier(pp, name)) {
1054         ParseError("GroupName Syntax error.");
1055         return -1;
1056     }
1057     int postion = FindGroupName(name);
1058     if (postion < 0) {
1059         postion = ParseCaptureCount(name.c_str());
1060         if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
1061             ParseError("group name not defined");
1062             return -1;
1063         }
1064     }
1065 
1066     return postion;
1067 }
1068 
EmitRefOpCode(DynChunk & buffer,uint32_t para,bool isBackward)1069 static void EmitRefOpCode(DynChunk &buffer, uint32_t para, bool isBackward)
1070 {
1071     if (isBackward) {
1072         BackwardBackReferenceOpCode backReferenceOp;
1073         backReferenceOp.EmitOpCode(&buffer, para);
1074     } else {
1075         BackReferenceOpCode backReferenceOp;
1076         backReferenceOp.EmitOpCode(&buffer, para);
1077     }
1078 }
1079 
1080 // CC-OFFNXT(G.FUN.01, huge_method) big switch case
1081 // NOLINTNEXTLINE(readability-function-size)
ParseAtomEscape(bool isBackward)1082 int RegExpParser::ParseAtomEscape(bool isBackward)
1083 {
1084     // AtomEscape[U, N]::
1085     //     DecimalEscape
1086     //     CharacterClassEscape[?U]
1087     //     CharacterEscape[?U]
1088     //     [+N]kGroupName[?U]
1089     int result = -1;
1090     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1091     PrintF("Parse AtomEscape------\n");
1092     PrevOpCode prevOp;
1093     switch (c0_) {
1094         case KEY_EOF:
1095             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1096             ParseError("unexpected end");
1097             break;
1098         // DecimalEscape
1099         case '1':
1100         case '2':
1101         case '3':
1102         case '4':
1103         case '5':
1104         case '6':
1105         case '7':
1106         case '8':
1107         case '9': {
1108             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1109             PrintF("NonZeroDigit %c\n", c0_);
1110             int capture = ParseDecimalDigits();
1111             if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
1112                 ParseError("invalid backreference count");
1113                 break;
1114             }
1115             EmitRefOpCode(buffer_, capture, isBackward);
1116             break;
1117         }
1118         // CharacterClassEscape
1119         case 'd': {
1120             // [0-9]
1121             InsertRangeOpCode(buffer_, g_gRangeD, prevOp, isBackward);
1122             break;
1123         }
1124         case 'D': {
1125             // [^0-9]
1126             InsertRange32OpCode(buffer_, g_gRangeD, prevOp, isBackward);
1127             break;
1128         }
1129         case 's': {
1130             // [\f\n\r\t\v]
1131             InsertRangeOpCode(buffer_, g_gRangeS, prevOp, isBackward);
1132             break;
1133         }
1134         case 'S': {
1135             InsertRange32OpCode(buffer_, g_gRangeS, prevOp, isBackward);
1136             break;
1137         }
1138         case 'w': {
1139             // [A-Za-z0-9]
1140             InsertRangeOpCode(buffer_, g_gRangeW, prevOp, isBackward);
1141             break;
1142         }
1143         case 'W': {
1144             // [^A-Za-z0-9]
1145             InsertRange32OpCode(buffer_, g_gRangeW, prevOp, isBackward);
1146             break;
1147         }
1148         // P{UnicodePropertyValueExpression}
1149         // p{UnicodePropertyValueExpression}
1150         case 'P':
1151         case 'p':
1152         // [+N]kGroupName[?U]
1153         case 'k': {
1154             int postion = ParseGroupName();
1155             if (postion < 0) {
1156                 break;
1157             }
1158             EmitRefOpCode(buffer_, postion, isBackward);
1159             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1160             Advance();
1161             break;
1162         }
1163         default:
1164             result = ParseCharacterEscape();
1165             break;
1166     }
1167     return result;
1168 }
1169 
RecountCaptures()1170 int RegExpParser::RecountCaptures()
1171 {
1172     if (totalCaptureCount_ < 0) {
1173         const char *name = reinterpret_cast<const char *>(groupNames_.buf_);
1174         totalCaptureCount_ = ParseCaptureCount(name);
1175     }
1176     return totalCaptureCount_;
1177 }
HasNamedCaptures()1178 bool RegExpParser::HasNamedCaptures()
1179 {
1180     if (hasNamedCaptures_ < 0) {
1181         RecountCaptures();
1182     }
1183     return false;
1184 }
1185 
1186 // CC-OFFNXT(G.FUN.01, huge_cyclomatic_complexity, huge_method) big switch case
ParseCharacterEscape()1187 int RegExpParser::ParseCharacterEscape()
1188 {
1189     // CharacterEscape[U]::
1190     //     ControlEscape
1191     //     c ControlLetter
1192     //     0 [lookahead ∉ DecimalDigit]
1193     //     HexEscapeSequence
1194     //     RegExpUnicodeEscapeSequence[?U]
1195     //     IdentityEscape[?U]
1196     uint32_t result = 0;
1197     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1198     switch (c0_) {
1199         // ControlEscape
1200         case 'f':
1201             result = '\f';
1202             PrintControlEscapeAndAdvance();
1203             break;
1204         case 'n':
1205             result = '\n';
1206             PrintControlEscapeAndAdvance();
1207             break;
1208         case 'r':
1209             result = '\r';
1210             PrintControlEscapeAndAdvance();
1211             break;
1212         case 't':
1213             result = '\t';
1214             PrintControlEscapeAndAdvance();
1215             break;
1216         case 'v':
1217             result = '\v';
1218             PrintControlEscapeAndAdvance();
1219             break;
1220         // c ControlLetter
1221         case 'c': {
1222             ParseControlLetter(result);
1223             break;
1224         }
1225         case '0': {
1226             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1227             PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
1228             if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) {  // NOLINT(readability-magic-numbers)
1229                 Advance();
1230                 result = 0;
1231                 break;
1232             }
1233             [[fallthrough]];
1234         }
1235         case '1':
1236         case '2':
1237         case '3':
1238         case '4':
1239         case '5':
1240         case '6':
1241         case '7': {
1242             if (IsUtf16()) {
1243                 // With /u, decimal escape is not interpreted as octal character code.
1244                 ParseError("Invalid class escape");
1245                 return 0;
1246             }
1247             result = ParseOctalLiteral();
1248             break;
1249         }
1250         // ParseHexEscapeSequence
1251         // ParseRegExpUnicodeEscapeSequence
1252         case 'x': {
1253             Advance();
1254             if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
1255                 return result;
1256             }
1257             if (IsUtf16()) {
1258                 ParseError("Invalid class escape");
1259                 return -1;
1260             }
1261             result = 'x';
1262             break;
1263         }
1264         case 'u': {
1265             Advance();
1266             if (ParseUnicodeEscape(&result)) {
1267                 return result;
1268             }
1269             if (IsUtf16()) {
1270                 // With /u, invalid escapes are not treated as identity escapes.
1271                 ParseError("Invalid unicode escape");
1272                 return 0;
1273             }
1274             // If \u is not followed by a two-digit hexadecimal, treat it
1275             // as an identity escape.
1276             result = 'u';
1277             break;
1278         }
1279         // IdentityEscape[?U]
1280         case '$':
1281         case '(':
1282         case ')':
1283         case '*':
1284         case '+':
1285         case '.':
1286         case '/':
1287         case '?':
1288         case '[':
1289         case '\\':
1290         case ']':
1291         case '^':
1292         case '{':
1293         case '|':
1294         case '}':
1295             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1296             PrintF("IdentityEscape %c\n", c0_);
1297             result = c0_;
1298             Advance();
1299             break;
1300         default: {
1301             ParseCharacterEscapeDefault(result);
1302             break;
1303         }
1304     }
1305     return result;
1306 }
1307 
ParseCharacterEscapeDefault(uint32_t & result)1308 void RegExpParser::ParseCharacterEscapeDefault(uint32_t &result)
1309 {
1310     if (IsUtf16()) {
1311         ParseError("Invalid unicode escape");
1312         result = 0;
1313         return;
1314     }
1315     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1316     PrintF("SourceCharacter %c\n", c0_);
1317     result = c0_;
1318     if (result < CHAR_MAXS) {
1319         Advance();
1320     }
1321 }
1322 
ParseControlLetter(uint32_t & result)1323 void RegExpParser::ParseControlLetter(uint32_t &result)
1324 {
1325     Advance();
1326     if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1327         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1328         PrintF("ControlLetter %c\n", c0_);
1329         result = static_cast<uint32_t>(c0_) & 0x1f;  // NOLINT(readability-magic-numbers, hicpp-signed-bitwise)
1330         Advance();
1331     } else {
1332         if (!IsUtf16()) {
1333             pc_--;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1334             result = '\\';
1335         } else {
1336             ParseError("Invalid control letter");
1337             result = -1;
1338         }
1339     }
1340 }
1341 
PrintControlEscapeAndAdvance()1342 void RegExpParser::PrintControlEscapeAndAdvance()
1343 {
1344     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1345     PrintF("ControlEscape %c\n", c0_);
1346     Advance();
1347 }
1348 
ParseClassRangesImpl(RangeSet * result)1349 bool RegExpParser::ParseClassRangesImpl(RangeSet *result)
1350 {
1351     RangeSet s1;
1352     uint32_t c1 = ParseClassAtom(&s1);
1353     if (c1 == UINT32_MAX) {
1354         ParseError("invalid class range");
1355         return false;
1356     }
1357 
1358     int nextC0 = *pc_;
1359     if (c0_ == '-' && nextC0 != ']') {
1360         if (c1 == CLASS_RANGE_BASE) {
1361             if (IsUtf16()) {
1362                 ParseError("invalid class range");
1363                 return false;
1364             }
1365             result->Insert(s1);
1366             return true;
1367         }
1368         Advance();
1369         RangeSet s2;
1370         uint32_t c2 = ParseClassAtom(&s2);
1371         if (c2 == UINT32_MAX) {
1372             ParseError("invalid class range");
1373             return false;
1374         }
1375         if (c2 == CLASS_RANGE_BASE) {
1376             if (IsUtf16()) {
1377                 ParseError("invalid class range");
1378                 return false;
1379             }
1380             result->Insert(s2);
1381             return true;
1382         }
1383         if (c1 < INT8_MAX) {
1384             if (c1 > c2) {
1385                 ParseError("invalid class range");
1386                 return false;
1387             }
1388         }
1389         if (IsIgnoreCase()) {
1390             c1 = static_cast<uint32_t>(Canonicalize(c1, IsUtf16()));
1391             c2 = static_cast<uint32_t>(Canonicalize(c2, IsUtf16()));
1392         }
1393 
1394         result->Insert(c1, c2);
1395     } else {
1396         result->Insert(s1);
1397     }
1398 
1399     return true;
1400 }
1401 
ParseClassRanges(RangeSet * result)1402 bool RegExpParser::ParseClassRanges(RangeSet *result)
1403 {
1404     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1405     PrintF("Parse ClassRanges------\n");
1406     while (c0_ != ']') {
1407         if (!ParseClassRangesImpl(result)) {
1408             return false;
1409         }
1410     }
1411     Advance();
1412     return true;
1413 }
1414 
ParseClassAtom(RangeSet * atom)1415 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1416 {
1417     uint32_t ret = UINT32_MAX;
1418     switch (c0_) {
1419         case '\\': {
1420             Advance();
1421             ret = static_cast<uint32_t>(ParseClassEscape(atom));
1422             break;
1423         }
1424         case KEY_EOF:
1425             break;
1426         case 0: {
1427             if (pc_ >= end_) {
1428                 return UINT32_MAX;
1429             }
1430             [[fallthrough]];
1431         }
1432         default: {
1433             uint32_t value = c0_;
1434             size_t u16Size;
1435             if (c0_ > INT8_MAX) {
1436                 pc_ -= 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1437                 auto u16Result = utf::ConvertUtf8ToUtf16Pair(pc_, true);
1438                 value = u16Result.first;
1439                 u16Size = u16Result.second;
1440                 Advance(u16Size + 1);
1441             } else {
1442                 Advance();
1443             }
1444             if (IsIgnoreCase()) {
1445                 value = static_cast<uint32_t>(Canonicalize(value, IsUtf16()));
1446             }
1447             atom->Insert(RangeSet(value));
1448             ret = value;
1449             break;
1450         }
1451     }
1452     return ret;
1453 }
1454 
InsertRangeBase(RangeSet * atom,RangeSet & rangeSet,bool invert)1455 void RegExpParser::InsertRangeBase(RangeSet *atom, RangeSet &rangeSet, bool invert)
1456 {
1457     atom->Insert(rangeSet);
1458     if (invert) {
1459         atom->Invert(IsUtf16());
1460     }
1461 }
1462 
ParseClassEscape(RangeSet * atom)1463 int RegExpParser::ParseClassEscape(RangeSet *atom)
1464 {
1465     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1466     PrintF("Parse ClassEscape------\n");
1467     int result = -1;
1468     switch (c0_) {
1469         case 'b':
1470             Advance();
1471             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1472             PrintF("ClassEscape %c", 'b');
1473             result = '\b';
1474             atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1475             break;
1476         case '-':
1477             Advance();
1478             result = '-';
1479             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1480             PrintF("ClassEscape %c", '-');
1481             atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1482             break;
1483         // CharacterClassEscape
1484         case 'd':
1485         case 'D':
1486             result = CLASS_RANGE_BASE;
1487             InsertRangeBase(atom, g_gRangeD, c0_ == 'D');
1488             Advance();
1489             break;
1490         case 's':
1491         case 'S':
1492             result = CLASS_RANGE_BASE;
1493             InsertRangeBase(atom, g_gRangeS, c0_ == 'S');
1494             Advance();
1495             break;
1496         case 'w':
1497         case 'W':
1498             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1499             PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1500             result = CLASS_RANGE_BASE;
1501             InsertRangeBase(atom, g_gRangeW, c0_ == 'W');
1502             Advance();
1503             break;
1504         // P{UnicodePropertyValueExpression}
1505         // p{UnicodePropertyValueExpression}
1506         case 'P':
1507         case 'p':
1508             ParseUnicodePropertyValueCharacters(result);
1509             break;
1510         default:
1511             result = ParseCharacterEscape();
1512             int value = result;
1513             if (IsIgnoreCase()) {
1514                 value = Canonicalize(value, IsUtf16());
1515             }
1516             atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1517             break;
1518     }
1519     return result;
1520 }
1521 
ParseUnicodePropertyValueCharacters(int & result)1522 void RegExpParser::ParseUnicodePropertyValueCharacters(int &result)
1523 {
1524     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1525     PrintF("Warning: \\p is not supported in ECMA 2015!");
1526     Advance();
1527     if (c0_ == '{') {
1528         Advance();
1529         if (c0_ == '}') {
1530             return;  // p{}, invalid
1531         }
1532         bool isValue = false;
1533         ParseUnicodePropertyValueCharactersImpl(&isValue);
1534         if (!isValue && c0_ == '=') {
1535             // UnicodePropertyName = UnicodePropertyValue
1536             Advance();
1537             if (c0_ == '}') {
1538                 return;  // p{xxx=}, invalid
1539             }
1540             ParseUnicodePropertyValueCharactersImpl(&isValue);
1541         }
1542         if (c0_ != '}') {
1543             return;  // p{xxx, invalid
1544         }
1545         // should do atom->Invert() here after ECMA 9.0
1546         Advance();
1547         result = CLASS_RANGE_BASE;
1548     }
1549 }
1550 
ParseUnicodePropertyValueCharactersImpl(bool * isValue)1551 void RegExpParser::ParseUnicodePropertyValueCharactersImpl(bool *isValue)
1552 {
1553     if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1554         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1555         PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1556     } else if (c0_ == '_') {
1557         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1558         PrintF("UnicodePropertyCharacter:: _ \n");
1559     } else if (c0_ >= '0' && c0_ <= '9') {
1560         *isValue = true;
1561         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1562         PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1563     } else {
1564         return;
1565     }
1566     Advance();
1567     ParseUnicodePropertyValueCharactersImpl(isValue);
1568 }
1569 
1570 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1571 void RegExpParser::PrintF(const char *fmt, ...)
1572 {
1573     (void)fmt;
1574 }
1575 
ParseError(const char * errorMessage)1576 void RegExpParser::ParseError(const char *errorMessage)
1577 {
1578     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1579     PrintF("error: ");
1580     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1581     PrintF(errorMessage);
1582     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1583     PrintF("\n");
1584     SetIsError();
1585     size_t length = strlen(errorMessage) + 1;
1586     if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1587         LOG(FATAL, COMMON) << "memcpy_s failed";
1588         UNREACHABLE();
1589     }
1590 }
1591 
IsIdentFirst(uint32_t c)1592 int RegExpParser::IsIdentFirst(uint32_t c)
1593 {
1594     if (c < CACHE_SIZE) {
1595         // NOLINTNEXTLINE(hicpp-signed-bitwise
1596         return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1;  // 5: Shift five bits 31: and operation binary of 31
1597     }
1598     return static_cast<int>(u_isIDStart(c));
1599 }
1600 }  // namespace ark