• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "ecmascript/regexp/regexp_parser.h"
17 
18 #include "ecmascript/base/string_helper.h"
19 #include "ecmascript/ecma_macros.h"
20 #include "ecmascript/regexp/regexp_opcode.h"
21 #include "libpandabase/utils/utils.h"
22 #include "securec.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uniset.h"
25 #define _NO_DEBUG_
26 
27 namespace panda::ecmascript {
// File-local size constants (both 128); their uses are outside this part of the file.
28 static constexpr uint32_t CACHE_SIZE = 128;
29 static constexpr uint32_t CHAR_MAXS = 128;
// Bitmap over code points 0..127, 32 code points per word: bit (c % 32) of
// word (c / 32) is set when c is one of the characters listed below.
30 static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
31     /* $ A-Z _ a-z */
32     0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
33 };
// ASCII decimal digits '0'..'9' (presumably the set behind the \d class escape;
// the use site is not in this part of the file).
34 static RangeSet g_rangeD(0x30, 0x39);  // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
// White-space and line-terminator code points (presumably the set behind \s).
35 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
36 static RangeSet g_rangeS({
37     std::pair<uint32_t, uint32_t>(0x0009, 0x000D),  // NOLINTNEXTLINE(readability-magic-numbers)
38     std::pair<uint32_t, uint32_t>(0x0020, 0x0020),  // NOLINTNEXTLINE(readability-magic-numbers)
39     std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0),  // NOLINTNEXTLINE(readability-magic-numbers)
40     std::pair<uint32_t, uint32_t>(0x1680, 0x1680),  // NOLINTNEXTLINE(readability-magic-numbers)
41     std::pair<uint32_t, uint32_t>(0x2000, 0x200A),  // NOLINTNEXTLINE(readability-magic-numbers)
42     /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
43     /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
44     std::pair<uint32_t, uint32_t>(0x2028, 0x2029),  // NOLINTNEXTLINE(readability-magic-numbers)
45     std::pair<uint32_t, uint32_t>(0x202F, 0x202F),  // NOLINTNEXTLINE(readability-magic-numbers)
46     std::pair<uint32_t, uint32_t>(0x205F, 0x205F),  // NOLINTNEXTLINE(readability-magic-numbers)
47     std::pair<uint32_t, uint32_t>(0x3000, 0x3000),  // NOLINTNEXTLINE(readability-magic-numbers)
48     /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
49     std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF),  // NOLINTNEXTLINE(readability-magic-numbers)
50 });
51 
// ASCII word characters: '0'..'9', 'A'..'Z', '_' and 'a'..'z' (presumably the set behind \w).
52 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
53 static RangeSet g_rangeW({
54     std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINTNEXTLINE(readability-magic-numbers)
55     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINTNEXTLINE(readability-magic-numbers)
56     std::pair<uint32_t, uint32_t>(0x005F, 0x005F),  // NOLINTNEXTLINE(readability-magic-numbers)
57     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINTNEXTLINE(readability-magic-numbers)
58 });
59 
// ASCII characters accepted as the first character of an identifier: '$', 'A'..'Z', 'a'..'z'.
// NOTE(review): '_' (0x5F) is set in ID_START_TABLE_ASCII but is absent here - confirm this is intended.
60 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
61 static RangeSet g_regexpIdentifyStart({
62     std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINTNEXTLINE(readability-magic-numbers)
63     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINTNEXTLINE(readability-magic-numbers)
64     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINTNEXTLINE(readability-magic-numbers)
65 });
66 
// ASCII characters accepted after the first character of an identifier:
// '$', '0'..'9', 'A'..'Z', 'a'..'z'.
// NOTE(review): '_' (0x5F) is absent here as well - confirm this is intended.
67 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
68 static RangeSet g_regexpIdentifyContinue({
69     std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINTNEXTLINE(readability-magic-numbers)
70     std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINTNEXTLINE(readability-magic-numbers)
71     std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINTNEXTLINE(readability-magic-numbers)
72     std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINTNEXTLINE(readability-magic-numbers)
73 });
74 
Parse()75 void RegExpParser::Parse()
76 {
77     // dynbuffer head init [size,capture_count,statck_count,flags]
78     buffer_.EmitU32(0);
79     buffer_.EmitU32(0);
80     buffer_.EmitU32(0);
81     buffer_.EmitU32(0);
82     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
83     PrintF("Parse Pattern------\n");
84     // Pattern[U, N]::
85     //      Disjunction[?U, ?N]
86     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
87     Advance();
88     SaveStartOpCode saveStartOp;
89     int captureIndex = captureCount_++;
90     saveStartOp.EmitOpCode(&buffer_, captureIndex);
91     ParseDisjunction(false);
92     if (c0_ != KEY_EOF) {
93         ParseError("extraneous characters at the end");
94         return;
95     }
96     SaveEndOpCode saveEndOp;
97     saveEndOp.EmitOpCode(&buffer_, captureIndex);
98     MatchEndOpCode matchEndOp;
99     matchEndOp.EmitOpCode(&buffer_, 0);
100     // dynbuffer head assignments
101     buffer_.PutU32(0, buffer_.size_);
102     buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
103     buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
104     buffer_.PutU32(FLAGS_OFFSET, flags_);
105 #ifndef _NO_DEBUG_
106     RegExpOpCode::DumpRegExpOpCode(std::cout, buffer_);
107 #endif
108 }
109 
ParseDisjunction(bool isBackward)110 void RegExpParser::ParseDisjunction(bool isBackward)
111 {
112     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
113     PrintF("Parse Disjunction------\n");
114     if (c0_ == ')') {
115         isEmpty_ = true;
116         return;
117     }
118     size_t start = buffer_.size_;
119     ParseAlternative(isBackward);
120     if (isError_) {
121         return;
122     }
123     do {
124         if (c0_ == '|') {
125             SplitNextOpCode splitOp;
126             uint32_t len = buffer_.size_ - start;
127             GotoOpCode gotoOp;
128             splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
129             uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
130             Advance();
131             ParseAlternative(isBackward);
132             gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
133         }
134     } while (c0_ != KEY_EOF && c0_ != ')');
135 }
136 
ParseOctalLiteral()137 uint32_t RegExpParser::ParseOctalLiteral()
138 {
139     // For compatibility with some other browsers (not all), we parse
140     // up to three octal digits with a value below 256.
141     // ES#prod-annexB-LegacyOctalEscapeSequence
142     uint32_t value = c0_ - '0';
143     Advance();
144     if (c0_ >= '0' && c0_ <= '7') {
145         value = value * OCTAL_VALUE + c0_ - '0';
146         Advance();
147         if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
148             value = value * OCTAL_VALUE + c0_ - '0';
149             Advance();
150         }
151     }
152     return value;
153 }
154 
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)155 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
156 {
157     uint32_t x = 0;
158     int d = static_cast<int>(HexValue(c0_));
159     if (d < 0) {
160         return false;
161     }
162     while (d >= 0) {
163         if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
164             LOG_FULL(FATAL) << "value overflow";
165             return false;
166         }
167         x = x * HEX_VALUE + static_cast<uint32_t>(d);
168         if (x > maxValue) {
169             return false;
170         }
171         Advance();
172         d = static_cast<int>(HexValue(c0_));
173     }
174     *value = x;
175     return true;
176 }
177 
178 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uint32_t * value)179 bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
180 {
181     // Accept both \uxxxx and \u{xxxxxx} (if allowed).
182     // In the latter case, the number of hex digits between { } is arbitrary.
183     // \ and u have already been read.
184     if (c0_ == '{' && IsUtf16()) {
185         uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
186         Advance();
187         if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {  // NOLINTNEXTLINE(readability-magic-numbers)
188             if (c0_ == '}') {
189                 Advance();
190                 return true;
191             }
192         }
193         pc_ = start;
194         Advance();
195         return false;
196     }
197     // \u but no {, or \u{...} escapes not allowed.
198     bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
199     if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
200         // Attempt to read trail surrogate.
201         uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
202         if (*pc_ == 'u') {
203             Advance(UNICODE_HEX_ADVANCE);
204             uint32_t trail = 0;
205             if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
206                 *value = U16_GET_SUPPLEMENTARY((*value), (trail));  // NOLINTNEXTLINE(hicpp-signed-bitwise)
207                 return true;
208             }
209         }
210         pc_ = start;
211         Advance();
212     }
213     return result;
214 }
215 
ParseHexEscape(int length,uint32_t * value)216 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
217 {
218     uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
219     uint32_t val = 0;
220     for (int i = 0; i < length; ++i) {
221         uint32_t c = c0_;
222         int d = static_cast<int>(HexValue(c));
223         if (d < 0) {
224             pc_ = start;
225             Advance();
226             return false;
227         }
228         val = val * HEX_VALUE + static_cast<uint32_t>(d);
229         Advance();
230     }
231     *value = val;
232     return true;
233 }
234 
// Parses one Alternative: a sequence of terms that ends at '|', ')' or KEY_EOF.
// Each loop iteration handles one term (assertion, escape, group, '.', character
// class or literal character), emits its opcodes into buffer_ and, for atoms,
// hands over to ParseQuantifier. The function returns early once isError_ is set.
//
// isBackward: when true, character-consuming atoms are wrapped in PrevOpCode
// and, after each term, that term's bytecode is moved to the front of the
// alternative, so the terms end up in reverse order in the buffer.
235 // NOLINTNEXTLINE(readability-function-size)
ParseAlternative(bool isBackward)236 void RegExpParser::ParseAlternative(bool isBackward)
237 {
238     size_t start = buffer_.size_;
239     while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
240         if (isError_) {
241             return;
242         }
            // Offset at which this term's bytecode begins; passed to ParseQuantifier below.
243         size_t atomBcStart = buffer_.GetSize();
244         int captureIndex = 0;
245         bool isAtom = false;
246         switch (c0_) {
247             case '^': {
248                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
249                 PrintF("Assertion %c line start \n", c0_);
250                 LineStartOpCode lineStartOp;
251                 lineStartOp.EmitOpCode(&buffer_, 0);
252                 Advance();
253                 break;
254             }
255             case '$': {
256                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
257                 PrintF("Assertion %c line end \n", c0_);
258                 LineEndOpCode lineEndOp;
259                 lineEndOp.EmitOpCode(&buffer_, 0);
260                 Advance();
261                 break;
262             }
263             case '\\': {
264                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
265                 PrintF("Escape %c \n", c0_);
266                 Advance();
267                 switch (c0_) {
268                     case 'b': {
269                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
270                         PrintF("Assertion %c \n", c0_);
271                         WordBoundaryOpCode wordBoundaryOp;
272                         wordBoundaryOp.EmitOpCode(&buffer_, 0);
273                         Advance();
274                         break;
275                     }
276                     case 'B': {
277                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
278                         PrintF("Assertion %c \n", c0_);
279                         NotWordBoundaryOpCode notWordBoundaryOp;
280                         notWordBoundaryOp.EmitOpCode(&buffer_, 0);
281                         Advance();
282                         break;
283                     }
284                     default: {
285                         isAtom = true;
                            // A result of -1 means there is no single character to emit here
                            // (presumably ParseAtomEscape emitted its own opcodes or reported
                            // an error - it is defined outside this part of the file).
286                         int atomValue = ParseAtomEscape(isBackward);
287                         if (atomValue != -1) {
288                             PrevOpCode prevOp;
289                             if (isBackward) {
290                                 prevOp.EmitOpCode(&buffer_, 0);
291                             }
292                             if (IsIgnoreCase()) {
293                                 if (!IsUtf16()) {
294                                     atomValue = Canonicalize(atomValue, false);
295                                 } else {
                                        // Ignore-case in unicode mode: collect every code point ICU
                                        // treats as a case-insensitive equivalent of atomValue and
                                        // emit them as a range set instead of a single character.
296                                     icu::UnicodeSet set(atomValue, atomValue);
297                                     set.closeOver(USET_CASE_INSENSITIVE);
298                                     set.removeAllStrings();
299                                     uint32_t size = static_cast<uint32_t>(set.size());
300                                     RangeOpCode rangeOp;
301                                     RangeSet rangeResult;
302                                     for (uint32_t idx = 0; idx < size; idx++) {
303                                         int32_t uc = set.charAt(idx);
304                                         RangeSet curRange(uc);
305                                         rangeResult.Insert(curRange);
306                                     }
307                                     rangeOp.InsertOpCode(&buffer_, rangeResult);
                                        // NOTE(review): this break leaves the inner switch before the
                                        // trailing PrevOpCode for isBackward is emitted, and RangeOpCode
                                        // is used even when a code point exceeds UINT16_MAX, unlike the
                                        // '[' case below which switches to Range32OpCode - confirm intended.
308                                     break;
309                                 }
310                             }
311                             if (atomValue <= UINT16_MAX) {
312                                 CharOpCode charOp;
313                                 charOp.EmitOpCode(&buffer_, atomValue);
314                             } else {
315                                 Char32OpCode charOp;
316                                 charOp.EmitOpCode(&buffer_, atomValue);
317                             }
318                             if (isBackward) {
319                                 prevOp.EmitOpCode(&buffer_, 0);
320                             }
321                         }
322                         break;
323                     }
324                 }
325                 break;
326             }
327             case '(': {
328                 Advance();
329                 isAtom = ParseAssertionCapture(&captureIndex, isBackward);
                    // On success ParseAssertionCapture returns with c0_ on ')'; step past it.
330                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
331                 Advance();
332                 break;
333             }
334             case '.': {
335                 PrevOpCode prevOp;
336                 if (isBackward) {
337                     prevOp.EmitOpCode(&buffer_, 0);
338                 }
339                 if (IsDotAll()) {
340                     AllOpCode allOp;
341                     allOp.EmitOpCode(&buffer_, 0);
342                 } else {
343                     DotsOpCode dotsOp;
344                     dotsOp.EmitOpCode(&buffer_, 0);
345                 }
346                 if (isBackward) {
347                     prevOp.EmitOpCode(&buffer_, 0);
348                 }
349                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
350                 PrintF("Atom %c match any \n", c0_);
351                 isAtom = true;
352                 Advance();
353                 break;
354             }
355             case '[': {
356                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
357                 PrintF("Atom %c match range \n", c0_);
358                 isAtom = true;
359                 PrevOpCode prevOp;
360                 Advance();
361                 if (isBackward) {
362                     prevOp.EmitOpCode(&buffer_, 0);
363                 }
364                 bool isInvert = false;
365                 if (c0_ == '^') {
366                     isInvert = true;
367                     Advance();
368                 }
369                 RangeSet rangeResult;
370                 if (!ParseClassRanges(&rangeResult)) {
371                     break;
372                 }
373                 if (isInvert) {
374                     rangeResult.Invert(IsUtf16());
375                 }
                    // Choose the 16-bit or 32-bit range opcode from the largest code point in the set.
376                 uint32_t highValue = rangeResult.HighestValue();
377                 if (highValue <= UINT16_MAX) {
378                     RangeOpCode rangeOp;
379                     rangeOp.InsertOpCode(&buffer_, rangeResult);
380                 } else {
381                     Range32OpCode rangeOp;
382                     rangeOp.InsertOpCode(&buffer_, rangeResult);
383                 }
384 
385                 if (isBackward) {
386                     prevOp.EmitOpCode(&buffer_, 0);
387                 }
388                 break;
389             }
390             case '*':
391             case '+':
392             case '?':
393                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
394                 ParseError("nothing to repeat");
395                 return;
396             case '{': {
                    // A '{' with no preceding atom: if it parses as a complete interval
                    // quantifier it is an error; otherwise rewind to the '{' and fall
                    // through so it is handled like '}' and ']' below.
397                 uint8_t *begin = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
398                 int dummy;
399                 if (ParserIntervalQuantifier(&dummy, &dummy)) {
400                     ParseError("nothing to repeat");
401                     return;
402                 }
403                 pc_ = begin;
404                 Advance();
405             }
406                 [[fallthrough]];
407             case '}':
408             case ']':
                    // Rejected when IsUtf16() is true; otherwise treated as a literal character.
409                 if (IsUtf16()) {
410                     ParseError("syntax error");
411                     return;
412                 }
413                 [[fallthrough]];
414             default: {
415                 // PatternCharacter
416                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
417                 PrintF("PatternCharacter %c\n", c0_);
418                 isAtom = true;
419                 {
420                     PrevOpCode prevOp;
421                     if (isBackward) {
422                         prevOp.EmitOpCode(&buffer_, 0);
423                     }
424                     uint32_t matchedChar = c0_;
                        // Values above INT8_MAX + 1 are not matched as-is: step back and decode
                        // a full code point from the UTF-8 input with U8_NEXT, then move pc_
                        // past the bytes it consumed.
425                     if (c0_ > (INT8_MAX + 1)) {
426                         Prev();
427                         int i = 0;
428                         UChar32 c;
429                         int32_t length = end_ - pc_ + 1;
430                         // NOLINTNEXTLINE(hicpp-signed-bitwise)
431                         U8_NEXT(pc_, i, length, c);  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
432                         matchedChar = static_cast<uint32_t>(c);
433                         pc_ += i;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
434                     }
435                     if (IsIgnoreCase()) {
436                         matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
437                     }
438                     if (matchedChar > UINT16_MAX) {
439                         Char32OpCode charOp;
440                         charOp.EmitOpCode(&buffer_, matchedChar);
441                     } else {
442                         CharOpCode charOp;
443                         charOp.EmitOpCode(&buffer_, matchedChar);
444                     }
445                     if (isBackward) {
446                         prevOp.EmitOpCode(&buffer_, 0);
447                     }
448                 }
449                 Advance();
450                 break;
451             }
452         }
453         if (isAtom && !isError_) {
454             ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
455         }
456         if (isBackward) {
                // Move the term just emitted ([atomBcStart, end)) in front of the earlier
                // terms of this alternative: shift [start, end) right by termSize, which
                // leaves the term's bytes at offset `end`, then copy them back to `start`.
457             size_t end = buffer_.GetSize();
458             size_t termSize = end - atomBcStart;
459             size_t moveSize = end - start;
460             buffer_.Expand(end + termSize);
461             if (memmove_s(buffer_.buf_ + start +  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
462                               termSize,           // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
463                           moveSize,
464                           buffer_.buf_ + start,  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
465                           moveSize) != EOK) {
466                 LOG_FULL(FATAL) << "memmove_s failed";
467                 UNREACHABLE();
468             }
469             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
470             if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
471                 LOG_FULL(FATAL) << "memcpy_s failed";
472                 UNREACHABLE();
473             }
474         }
475     }
476 }
477 
FindGroupName(const CString & name)478 int RegExpParser::FindGroupName(const CString &name)
479 {
480     size_t len = 0;
481     size_t nameLen = name.size();
482     const char *p = reinterpret_cast<char *>(groupNames_.buf_);
483     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
484     const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
485     int captureIndex = 1;
486     while (p < bufEnd) {
487         len = strlen(p);
488         if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
489             return captureIndex;
490         }
491         p += len + 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
492         captureIndex++;
493     }
494     return -1;
495 }
496 
// Parses the inside of a parenthesised group; the caller has already consumed '('.
// Handles the lookaround assertions (?= (?! (?<= (?<!, the named group (?<name>,
// the non-capturing group (?: and the plain capturing group.
//
// captureIndex: receives the capture index allocated for a capturing group;
//               left untouched for the other forms.
// isBackward:   direction inherited from the caller; lookbehind bodies are
//               always parsed with isBackward == true.
// Returns true when the group is an atom (capturing or non-capturing group),
// false for lookaround assertions and on error. A successful return leaves
// c0_ on the closing ')'.
ParseAssertionCapture(int * captureIndex,bool isBackward)497 bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
498 {
499     bool isAtom = false;
500     do {
501         if (c0_ == '?') {
502             Advance();
503             switch (c0_) {
504                 // (?=Disjunction[?U, ?N])
505                 case '=': {
506                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
507                     PrintF("Assertion(?= Disjunction)\n");
508                     Advance();
                        // The body is terminated with Match, then MatchAhead is inserted in
                        // front of it with the body's length as operand.
509                     uint32_t start = buffer_.size_;
510                     ParseDisjunction(isBackward);
511                     MatchOpCode matchOp;
512                     matchOp.EmitOpCode(&buffer_, 0);
513                     MatchAheadOpCode matchAheadOp;
514                     uint32_t len = buffer_.size_ - start;
515                     matchAheadOp.InsertOpCode(&buffer_, start, len);
516                     break;
517                 }
518                 // (?!Disjunction[?U, ?N])
519                 case '!': {
520                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
521                     PrintF("Assertion(?! Disjunction)\n");
522                     uint32_t start = buffer_.size_;
523                     Advance();
524                     ParseDisjunction(isBackward);
525                     MatchOpCode matchOp;
526                     matchOp.EmitOpCode(&buffer_, 0);
527                     NegativeMatchAheadOpCode matchAheadOp;
528                     uint32_t len = buffer_.size_ - start;
529                     matchAheadOp.InsertOpCode(&buffer_, start, len);
530                     break;
531                 }
532                 case '<': {
533                     Advance();
534                     // (?<=Disjunction[?U, ?N])
535                     if (c0_ == '=') {
536                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
537                         PrintF("Assertion(?<= Disjunction)\n");
538                         Advance();
539                         uint32_t start = buffer_.size_;
540                         ParseDisjunction(true);
541                         MatchOpCode matchOp;
542                         matchOp.EmitOpCode(&buffer_, 0);
543                         MatchAheadOpCode matchAheadOp;
544                         uint32_t len = buffer_.size_ - start;
545                         matchAheadOp.InsertOpCode(&buffer_, start, len);
546                         // (?<!Disjunction[?U, ?N])
547                     } else if (c0_ == '!') {
548                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
549                         PrintF("Assertion(?<! Disjunction)\n");
550                         Advance();
551                         uint32_t start = buffer_.size_;
552                         ParseDisjunction(true);
553                         MatchOpCode matchOp;
554                         matchOp.EmitOpCode(&buffer_, 0);
555                         NegativeMatchAheadOpCode matchAheadOp;
556                         uint32_t len = buffer_.size_ - start;
557                         matchAheadOp.InsertOpCode(&buffer_, start, len);
558                     } else {
                            // Neither (?<= nor (?<! : treat as a named group (?<name>...).
                            // Step back so ParseGroupSpecifier reads the name through pc_.
559                         Prev();
560                         CString name;
561                         auto **pp = const_cast<const uint8_t **>(&pc_);
562                         if (!ParseGroupSpecifier(pp, name)) {
563                             ParseError("GroupName Syntax error.");
564                             return false;
565                         }
566                         if (FindGroupName(name) > 0) {
567                             ParseError("Duplicate GroupName error.");
568                             return false;
569                         }
                            // Record the name, then continue in the capturing-group path below.
570                         groupNames_.EmitStr(name.c_str());
571                         newGroupNames_.push_back(name);
572                         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
573                         PrintF("group name %s", name.c_str());
574                         Advance();
575                         goto parseCapture;  // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto)
576                     }
577                     break;
578                 }
579                 // (?:Disjunction[?U, ?N])
580                 case ':':
581                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
582                     PrintF("Atom(?<: Disjunction)\n");
583                     isAtom = true;
584                     Advance();
585                     ParseDisjunction(isBackward);
586                     break;
587                 default:
588                     Advance();
589                     ParseError("? Syntax error.");
590                     return false;
591             }
592             if (isError_) {
593                 return false;
594             }
595         } else {
                // Unnamed capturing group: append a single 0 byte to groupNames_, i.e. an
                // empty entry in the NUL-separated name list that FindGroupName scans.
596             groupNames_.EmitChar(0);
597         parseCapture:
598             isAtom = true;
599             *captureIndex = captureCount_++;
600             SaveEndOpCode saveEndOp;
601             SaveStartOpCode saveStartOp;
                // When parsing backward the end marker is emitted first and the start
                // marker last; forward parsing uses the opposite order.
602             if (isBackward) {
603                 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
604             } else {
605                 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
606             }
607             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
608             PrintF("capture start %d \n", *captureIndex);
609             ParseDisjunction(isBackward);
610             if (isError_) {
611                 return false;
612             }
613             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
614             PrintF("capture end %d \n", *captureIndex);
615             if (isBackward) {
616                 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
617             } else {
618                 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
619             }
620         }
621     } while (c0_ != ')' && c0_ != KEY_EOF);
        // Reaching KEY_EOF without a ')' means the group was never closed.
622     if (c0_ != ')') {
623         ParseError("capture syntax error");
624         return false;
625     }
626     return isAtom;
627 }
628 
ParseDecimalDigits()629 int RegExpParser::ParseDecimalDigits()
630 {
631     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
632     PrintF("Parse DecimalDigits------\n");
633     uint32_t result = 0;
634     bool overflow = false;
635     while (true) {
636         if (c0_ < '0' || c0_ > '9') {
637             break;
638         }
639         if (!overflow) {
640             if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
641                 overflow = true;
642             } else {
643                 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
644             }
645         }
646         Advance();
647     }
648     if (overflow) {
649         return INT32_MAX;
650     }
651     return result;
652 }
653 
ParserIntervalQuantifier(int * pmin,int * pmax)654 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
655 {
656     // Quantifier::
657     //     QuantifierPrefix
658     //     QuantifierPrefix?
659     // QuantifierPrefix::
660     // *
661     // +
662     // ?
663     // {DecimalDigits}
664     // {DecimalDigits,}
665     // {DecimalDigits,DecimalDigits}
666     Advance();
667     *pmin = ParseDecimalDigits();
668     *pmax = *pmin;
669     switch (c0_) {
670         case ',': {
671             Advance();
672             if (c0_ == '}') {
673                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
674                 PrintF("QuantifierPrefix{DecimalDigits,}\n");
675                 *pmax = INT32_MAX;
676                 Advance();
677             } else {
678                 *pmax = ParseDecimalDigits();
679                 if (c0_ == '}') {
680                     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
681                     PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
682                     Advance();
683                 } else {
684                     return false;
685                 }
686             }
687             break;
688         }
689         case '}':
690             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
691             PrintF("QuantifierPrefix{DecimalDigits}\n");
692             Advance();
693             break;
694         default:
695             Advance();
696             return false;
697     }
698     return true;
699 }
700 
ParseQuantifier(size_t atomBcStart,int captureStart,int captureEnd)701 void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
702 {
703     int min = -1;
704     int max = -1;
705     bool isGreedy = true;
706     switch (c0_) {
707         case '*':
708             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
709             PrintF("QuantifierPrefix %c\n", c0_);
710             min = 0;
711             max = INT32_MAX;
712             Advance();
713             break;
714         case '+':
715             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
716             PrintF("QuantifierPrefix %c\n", c0_);
717             min = 1;
718             max = INT32_MAX;
719             Advance();
720             break;
721         case '?':
722             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
723             PrintF("QuantifierPrefix %c\n", c0_);
724             Advance();
725             min = 0;
726             max = 1;
727             break;
728         case '{': {
729             uint8_t *start = pc_ - 1;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
730             if (!ParserIntervalQuantifier(&min, &max)) {
731                 pc_ = start;
732                 Advance();  // back to '{'
733                 return;
734             }
735             if (min > max) {
736                 ParseError("Invalid repetition count");
737                 return;
738             }
739             break;
740         }
741         default:
742             break;
743     }
744     if (c0_ == '?') {
745         isGreedy = false;
746         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
747         PrintF("Quantifier::QuantifierPrefix?\n");
748         Advance();
749     } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
750         ParseError("nothing to repeat");
751         return;
752     }
753     if (min != -1 && max != -1 && !isEmpty_) {
754         stackCount_++;
755         PushOpCode pushOp;
756         pushOp.InsertOpCode(&buffer_, atomBcStart);
757         atomBcStart += pushOp.GetSize();
758 
759         if (captureStart != 0) {
760             SaveResetOpCode saveResetOp;
761             saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
762         }
763 
764         // zero advance check
765         if (max == INT32_MAX) {
766             stackCount_++;
767             PushCharOpCode pushCharOp;
768             pushCharOp.InsertOpCode(&buffer_, atomBcStart);
769             CheckCharOpCode checkCharOp;
770             // NOLINTNEXTLINE(readability-magic-numbers)
771             checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
772         }
773 
774         if (isGreedy) {
775             LoopGreedyOpCode loopOp;
776             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
777         } else {
778             LoopOpCode loopOp;
779             loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
780         }
781 
782         if (min == 0) {
783             if (isGreedy) {
784                 SplitNextOpCode splitNextOp;
785                 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
786             } else {
787                 SplitFirstOpCode splitFirstOp;
788                 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
789             }
790         }
791 
792         PopOpCode popOp;
793         popOp.EmitOpCode(&buffer_);
794     }
795     isEmpty_ = false;
796 }
797 
ParseGroupSpecifier(const uint8_t ** pp,CString & name)798 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, CString &name)
799 {
800     const uint8_t *p = *pp;
801     uint32_t c = 0;
802     char buffer[CACHE_SIZE] = {0};
803     char *q = buffer;
804     while (true) {
805         if (p <= end_) {
806             c = *p;
807         } else {
808             c = KEY_EOF;
809         }
810         if (c == '\\') {
811             p++;
812             if (*p != 'u') {
813                 return false;
814             }
815             if (!ParseUnicodeEscape(&c)) {
816                 return false;
817             }
818         } else if (c == '>') {
819             break;
820         } else if (c > CACHE_SIZE && c != KEY_EOF) {
821             c = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
822         } else if (c != KEY_EOF) {
823             p++;
824         } else {
825             return false;
826         }
827         if (q == buffer) {
828             if (!IsIdentFirst(c)) {
829                 return false;
830             }
831         } else {
832             if (!u_isIDPart(c)) {
833                 return false;
834             }
835         }
836         if (q != nullptr) {
837             *q++ = c;
838         }
839     } // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
840     p++;
841     *pp = p;
842     name = buffer;
843     return true;
844 }
845 
ParseCaptureCount(const char * groupName)846 int RegExpParser::ParseCaptureCount(const char *groupName)
847 {
848     const uint8_t *p = nullptr;
849     int captureIndex = 1;
850     CString name;
851     hasNamedCaptures_ = 0;
852     for (p = base_; p < end_; p++) {  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
853         switch (*p) {
854             case '(': {
855                 if (p[1] == '?') {  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
856                     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
857                     if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
858                         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
859                         p[CAPTURE_CONUT_ADVANCE] != '=') {
860                         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
861                         hasNamedCaptures_ = 1;
862                         p += CAPTURE_CONUT_ADVANCE;
863                         if (groupName != nullptr) {
864                             if (ParseGroupSpecifier(&p, name)) {
865                                 if (strcmp(name.c_str(), groupName) == 0) {
866                                     return captureIndex;
867                                 }
868                             }
869                         }
870                         captureIndex++;
871                     }
872                 } else {
873                     captureIndex++;
874                 }
875                 break;
876             }
877             case '\\':
878                 p++;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
879                 break;
880             case '[': {
881                 while (p < end_ && *p != ']') {
882                     if (*p == '\\') {
883                         p++;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
884                     }
885                     p++;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
886                 }
887                 break;
888             }
889             default:
890                 break;
891         }
892     }
893     return captureIndex;
894 }
895 
896 // NOLINTNEXTLINE(readability-function-size)
ParseAtomEscape(bool isBackward)897 int RegExpParser::ParseAtomEscape(bool isBackward)
898 {
899     // AtomEscape[U, N]::
900     //     DecimalEscape
901     //     CharacterClassEscape[?U]
902     //     CharacterEscape[?U]
903     //     [+N]kGroupName[?U]
904     int result = -1;
905     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
906     PrintF("Parse AtomEscape------\n");
907     PrevOpCode prevOp;
908     switch (c0_) {
909         case KEY_EOF:
910             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
911             ParseError("unexpected end");
912             break;
913         // DecimalEscape
914         case '1':
915         case '2':
916         case '3':
917         case '4':
918         case '5':
919         case '6':
920         case '7':
921         case '8':
922         case '9': {
923             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
924             PrintF("NonZeroDigit %c\n", c0_);
925             int capture = ParseDecimalDigits();
926             if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
927                 ParseError("invalid backreference count");
928                 break;
929             }
930             if (isBackward) {
931                 BackwardBackReferenceOpCode backReferenceOp;
932                 backReferenceOp.EmitOpCode(&buffer_, capture);
933             } else {
934                 BackReferenceOpCode backReferenceOp;
935                 backReferenceOp.EmitOpCode(&buffer_, capture);
936             }
937             break;
938         }
939         // CharacterClassEscape
940         case 'd': {
941             // [0-9]
942             RangeOpCode rangeOp;
943             if (isBackward) {
944                 prevOp.EmitOpCode(&buffer_, 0);
945             }
946             rangeOp.InsertOpCode(&buffer_, g_rangeD);
947             goto parseLookBehind;
948         }
949         case 'D': {
950             // [^0-9]
951             RangeSet atomRange(g_rangeD);
952             atomRange.Invert(IsUtf16());
953             Range32OpCode rangeOp;
954             if (isBackward) {
955                 prevOp.EmitOpCode(&buffer_, 0);
956             }
957             rangeOp.InsertOpCode(&buffer_, atomRange);
958             goto parseLookBehind;
959         }
960         case 's': {
961             // [\f\n\r\t\v]
962             RangeOpCode rangeOp;
963             if (isBackward) {
964                 prevOp.EmitOpCode(&buffer_, 0);
965             }
966             rangeOp.InsertOpCode(&buffer_, g_rangeS);
967             goto parseLookBehind;
968         }
969         case 'S': {
970             RangeSet atomRange(g_rangeS);
971             Range32OpCode rangeOp;
972             atomRange.Invert(IsUtf16());
973             if (isBackward) {
974                 prevOp.EmitOpCode(&buffer_, 0);
975             }
976             rangeOp.InsertOpCode(&buffer_, atomRange);
977             goto parseLookBehind;
978         }
979         case 'w': {
980             // [A-Za-z0-9]
981             RangeOpCode rangeOp;
982             if (isBackward) {
983                 prevOp.EmitOpCode(&buffer_, 0);
984             }
985             rangeOp.InsertOpCode(&buffer_, g_rangeW);
986             goto parseLookBehind;
987         }
988         case 'W': {
989             // [^A-Za-z0-9]
990             RangeSet atomRange(g_rangeW);
991             atomRange.Invert(IsUtf16());
992             Range32OpCode rangeOp;
993             if (isBackward) {
994                 prevOp.EmitOpCode(&buffer_, 0);
995             }
996             rangeOp.InsertOpCode(&buffer_, atomRange);
997             goto parseLookBehind;
998         }
999         // P{UnicodePropertyValueExpression}
1000         // p{UnicodePropertyValueExpression}
1001         case 'P':
1002         case 'p':
1003         // [+N]kGroupName[?U]
1004         case 'k': {
1005             Advance();
1006             if (c0_ != '<') {
1007                 if (!IsUtf16() || HasNamedCaptures()) {
1008                     ParseError("expecting group name.");
1009                     break;
1010                 }
1011             }
1012             Advance();
1013             Prev();
1014             CString name;
1015             auto **pp = const_cast<const uint8_t **>(&pc_);
1016             if (!ParseGroupSpecifier(pp, name)) {
1017                 ParseError("GroupName Syntax error.");
1018                 break;
1019             }
1020             int postion = FindGroupName(name);
1021             if (postion < 0) {
1022                 postion = ParseCaptureCount(name.c_str());
1023                 if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
1024                     ParseError("group name not defined");
1025                     break;
1026                 }
1027             }
1028             if (isBackward) {
1029                 BackwardBackReferenceOpCode backReferenceOp;
1030                 backReferenceOp.EmitOpCode(&buffer_, postion);
1031             } else {
1032                 BackReferenceOpCode backReferenceOp;
1033                 backReferenceOp.EmitOpCode(&buffer_, postion);
1034             }
1035             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1036             Advance();
1037             break;
1038         }
1039             parseLookBehind: {
1040                 if (isBackward) {
1041                     prevOp.EmitOpCode(&buffer_, 0);
1042                 }
1043                 Advance();
1044                 break;
1045             }
1046         default:
1047             result = ParseCharacterEscape();
1048             break;
1049     }
1050     return result;
1051 }
1052 
RecountCaptures()1053 int RegExpParser::RecountCaptures()
1054 {
1055     if (totalCaptureCount_ < 0) {
1056         const char *name = reinterpret_cast<const char*>(groupNames_.buf_);
1057         totalCaptureCount_ = ParseCaptureCount(name);
1058     }
1059     return totalCaptureCount_;
1060 }
HasNamedCaptures()1061 bool RegExpParser::HasNamedCaptures()
1062 {
1063     if (hasNamedCaptures_ < 0) {
1064         RecountCaptures();
1065     }
1066     return false;
1067 }
1068 
ParseCharacterEscape()1069 int RegExpParser::ParseCharacterEscape()
1070 {
1071     // CharacterEscape[U]::
1072     //     ControlEscape
1073     //     c ControlLetter
1074     //     0 [lookahead ∉ DecimalDigit]
1075     //     HexEscapeSequence
1076     //     RegExpUnicodeEscapeSequence[?U]
1077     //     IdentityEscape[?U]
1078     uint32_t result = 0;
1079     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1080     switch (c0_) {
1081         // ControlEscape
1082         case 'f':
1083             result = '\f';
1084             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1085             PrintF("ControlEscape %c\n", c0_);
1086             Advance();
1087             break;
1088         case 'n':
1089             result = '\n';
1090             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1091             PrintF("ControlEscape %c\n", c0_);
1092             Advance();
1093             break;
1094         case 'r':
1095             result = '\r';
1096             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1097             PrintF("ControlEscape %c\n", c0_);
1098             Advance();
1099             break;
1100         case 't':
1101             result = '\t';
1102             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1103             PrintF("ControlEscape %c\n", c0_);
1104             Advance();
1105             break;
1106         case 'v':
1107             result = '\v';
1108             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1109             PrintF("ControlEscape %c\n", c0_);
1110             Advance();
1111             break;
1112         // c ControlLetter
1113         case 'c': {
1114             Advance();
1115             if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1116                 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1117                 PrintF("ControlLetter %c\n", c0_);
1118                 result = static_cast<uint32_t>(c0_) & 0x1f;  // NOLINTNEXTLINE(readability-magic-numbers)
1119                 Advance();
1120             } else {
1121                 if (!IsUtf16()) {
1122                     pc_--;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1123                     result = '\\';
1124                 } else {
1125                     ParseError("Invalid control letter");
1126                     return -1;
1127                 }
1128             }
1129             break;
1130         }
1131         case '0': {
1132             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1133             PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
1134             if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) {  // NOLINTNEXTLINE(readability-magic-numbers)
1135                 Advance();
1136                 result = 0;
1137                 break;
1138             }
1139             [[fallthrough]];
1140         }
1141         case '1':
1142         case '2':
1143         case '3':
1144         case '4':
1145         case '5':
1146         case '6':
1147         case '7': {
1148             if (IsUtf16()) {
1149                 // With /u, decimal escape is not interpreted as octal character code.
1150                 ParseError("Invalid class escape");
1151                 return 0;
1152             }
1153             result = ParseOctalLiteral();
1154             break;
1155         }
1156         // ParseHexEscapeSequence
1157         // ParseRegExpUnicodeEscapeSequence
1158         case 'x': {
1159             Advance();
1160             if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
1161                 return result;
1162             }
1163             if (IsUtf16()) {
1164                 ParseError("Invalid class escape");
1165                 return -1;
1166             }
1167             result = 'x';
1168             break;
1169         }
1170         case 'u': {
1171             Advance();
1172             if (ParseUnicodeEscape(&result)) {
1173                 return result;
1174             }
1175             if (IsUtf16()) {
1176                 // With /u, invalid escapes are not treated as identity escapes.
1177                 ParseError("Invalid unicode escape");
1178                 return 0;
1179             }
1180             // If \u is not followed by a two-digit hexadecimal, treat it
1181             // as an identity escape.
1182             result = 'u';
1183             break;
1184         }
1185         // IdentityEscape[?U]
1186         case '$':
1187         case '(':
1188         case ')':
1189         case '*':
1190         case '+':
1191         case '.':
1192         case '/':
1193         case '?':
1194         case '[':
1195         case '\\':
1196         case ']':
1197         case '^':
1198         case '{':
1199         case '|':
1200         case '}':
1201             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1202             PrintF("IdentityEscape %c\n", c0_);
1203             result = c0_;
1204             Advance();
1205             break;
1206         default: {
1207             if (IsUtf16()) {
1208                 ParseError("Invalid unicode escape");
1209                 return 0;
1210             }
1211             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1212             PrintF("SourceCharacter %c\n", c0_);
1213             result = c0_;
1214             if (result < CHAR_MAXS) {
1215                 Advance();
1216             } else {
1217                 Prev();
1218                 const uint8_t *p = pc_;
1219                 result = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
1220                 int offset = static_cast<int>(p - pc_);
1221                 Advance(offset + 1);
1222             }
1223             break;
1224         }
1225     }
1226     return static_cast<int>(result);
1227 }
1228 
ParseClassRanges(RangeSet * result)1229 bool RegExpParser::ParseClassRanges(RangeSet *result)
1230 {
1231     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1232     PrintF("Parse ClassRanges------\n");
1233     while (c0_ != ']') {
1234         RangeSet s1;
1235         bool needInter = false;
1236         uint32_t c1 = ParseClassAtom(&s1);
1237         if (c1 == UINT32_MAX) {
1238             ParseError("invalid class range");
1239             return false;
1240         }
1241         needInter = NeedIntersection(c1);
1242         int next_c0 = *pc_;
1243         if (c0_ == '-' && next_c0 != ']') {
1244             if (c1 == CLASS_RANGE_BASE) {
1245                 if (IsUtf16()) {
1246                     ParseError("invalid class range");
1247                     return false;
1248                 }
1249                 result->Insert(s1);
1250                 continue;
1251             }
1252             Advance();
1253             RangeSet s2;
1254             uint32_t c2 = ParseClassAtom(&s2);
1255             if (c2 == UINT32_MAX) {
1256                 ParseError("invalid class range");
1257                 return false;
1258             }
1259             if (c2 == CLASS_RANGE_BASE) {
1260                 if (IsUtf16()) {
1261                     ParseError("invalid class range");
1262                     return false;
1263                 }
1264                 result->Insert(s2);
1265                 continue;
1266             }
1267             if (c1 < INT8_MAX) {
1268                 if (c1 > c2) {
1269                     ParseError("invalid class range");
1270                     return false;
1271                 }
1272             }
1273             needInter = NeedIntersection(c2);
1274             result->Insert(c1, c2);
1275             if (IsIgnoreCase() && needInter) {
1276                 ProcessIntersection(result);
1277             }
1278         } else {
1279             result->Insert(s1);
1280             if (!(IsIgnoreCase() && needInter)) {
1281                 continue;
1282             }
1283             if (c1 <= 'z' && c1 >= 'a') {
1284                 result->Insert(RangeSet(c1 - 'a' + 'A'));
1285             } else {
1286                 result->Insert(RangeSet(c1 - 'A' + 'a'));
1287             }
1288         }
1289     }
1290     Advance();
1291     return true;
1292 }
1293 
ParseClassAtom(RangeSet * atom)1294 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1295 {
1296     uint32_t ret = UINT32_MAX;
1297     switch (c0_) {
1298         case '\\': {
1299             Advance();
1300             ret = static_cast<uint32_t>(ParseClassEscape(atom));
1301             break;
1302         }
1303         case KEY_EOF:
1304             break;
1305         case 0: {
1306             if (pc_ >= end_) {
1307                 return UINT32_MAX;
1308             }
1309             [[fallthrough]];
1310         }
1311         default: {
1312             uint32_t value = c0_;
1313             size_t u16_size = 0;
1314             if (c0_ > INT8_MAX) {  // NOLINTNEXTLINE(readability-magic-numbers)
1315                 pc_ -= 1;          // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1316                 auto u16_result = base::utf_helper::ConvertUtf8ToUtf16Pair(pc_, true);
1317                 value = u16_result.first;
1318                 u16_size = u16_result.second;
1319                 Advance(u16_size + 1);
1320             } else {
1321                 Advance();
1322             }
1323             atom->Insert(RangeSet(value));
1324             ret = value;
1325             break;
1326         }
1327     }
1328     return ret;
1329 }
1330 
ParseClassEscape(RangeSet * atom)1331 int RegExpParser::ParseClassEscape(RangeSet *atom)
1332 {
1333     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1334     PrintF("Parse ClassEscape------\n");
1335     int result = -1;
1336     switch (c0_) {
1337         case 'b':
1338             Advance();
1339             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1340             PrintF("ClassEscape %c", 'b');
1341             result = '\b';
1342             atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1343             break;
1344         case '-':
1345             Advance();
1346             result = '-';
1347             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1348             PrintF("ClassEscape %c", '-');
1349             atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1350             break;
1351         // CharacterClassEscape
1352         case 'd':
1353         case 'D':
1354             result = CLASS_RANGE_BASE;
1355             atom->Insert(g_rangeD);
1356             if (c0_ == 'D') {
1357                 atom->Invert(IsUtf16());
1358             }
1359             Advance();
1360             break;
1361         case 's':
1362         case 'S':
1363             result = CLASS_RANGE_BASE;
1364             atom->Insert(g_rangeS);
1365             if (c0_ == 'S') {
1366                 atom->Invert(IsUtf16());
1367             }
1368             Advance();
1369             break;
1370         case 'w':
1371         case 'W':
1372             // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1373             PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1374             result = CLASS_RANGE_BASE;
1375             atom->Insert(g_rangeW);
1376             if (c0_ == 'W') {
1377                 atom->Invert(IsUtf16());
1378             }
1379             Advance();
1380             break;
1381         // P{UnicodePropertyValueExpression}
1382         // p{UnicodePropertyValueExpression}
1383         case 'P':
1384         case 'p':
1385             PrintF("Warning: \\p is not supported in ECMA 2015!");
1386             Advance();
1387             if (c0_ == '{') {
1388                 Advance();
1389                 if (c0_ == '}') {
1390                     break;  // p{}, invalid
1391                 }
1392                 bool isValue = false;
1393                 ParseUnicodePropertyValueCharacters(&isValue);
1394                 if (!isValue && c0_ == '=') {
1395                     // UnicodePropertyName = UnicodePropertyValue
1396                     Advance();
1397                     if (c0_ == '}') {
1398                         break;  // p{xxx=}, invalid
1399                     }
1400                     ParseUnicodePropertyValueCharacters(&isValue);
1401                 }
1402                 if (c0_ != '}') {
1403                     break;  // p{xxx, invalid
1404                 }
1405                 // should do atom->Invert() here after ECMA 9.0
1406                 Advance();
1407                 result = CLASS_RANGE_BASE;
1408             }
1409             break;
1410         default:
1411             result = ParseCharacterEscape();
1412             int value = result;
1413             if (IsIgnoreCase()) {
1414                 value = Canonicalize(value, IsUtf16());
1415             }
1416             atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1417             break;
1418     }
1419     return result;
1420 }
1421 
ParseUnicodePropertyValueCharacters(bool * isValue)1422 void RegExpParser::ParseUnicodePropertyValueCharacters(bool *isValue)
1423 {
1424     if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1425         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1426         PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1427     } else if (c0_ == '_') {
1428         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1429         PrintF("UnicodePropertyCharacter:: _ \n");
1430     } else if (c0_ >= '0' && c0_ <= '9') {
1431         *isValue = true;
1432         // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1433         PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1434     } else {
1435         return;
1436     }
1437     Advance();
1438     ParseUnicodePropertyValueCharacters(isValue);
1439 }
1440 
1441 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1442 void RegExpParser::PrintF(const char *fmt, ...)
1443 {
1444 #ifndef _NO_DEBUG_
1445     va_list args;
1446     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,)
1447     va_start(args, fmt);
1448     vprintf(fmt, args);
1449     va_end(args);
1450 #else
1451     (void)fmt;
1452 #endif
1453 }
1454 
ParseError(const char * errorMessage)1455 void RegExpParser::ParseError(const char *errorMessage)
1456 {
1457     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1458     PrintF("error: ");
1459     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1460     PrintF(errorMessage);
1461     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1462     PrintF("\n");
1463     SetIsError();
1464     size_t length = strlen(errorMessage) + 1;
1465     if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1466         LOG_FULL(FATAL) << "memcpy_s failed";
1467         UNREACHABLE();
1468     }
1469 }
1470 
IsIdentFirst(uint32_t c)1471 int RegExpParser::IsIdentFirst(uint32_t c)
1472 {
1473     if (c < CACHE_SIZE) {
1474         return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1; // 5: Shift five bits 31: and operation binary of 31
1475     } else {
1476         return static_cast<int>(u_isIDStart(c));
1477     }
1478 }
1479 
Canonicalize(int c,bool isUnicode)1480 int RegExpParser::Canonicalize(int c, bool isUnicode)
1481 {
1482     if (c < TMP_BUF_SIZE) {  // NOLINTNEXTLINE(readability-magic-numbers)
1483         if (c >= 'a' && c <= 'z') {
1484             c = c - 'a' + 'A';
1485         }
1486     } else {
1487         int cur = c;
1488         if (isUnicode) {
1489             c = u_tolower(static_cast<UChar32>(c));
1490             if (c >= 'a' && c <= 'z') {
1491                 c = cur;
1492             }
1493         } else {
1494             c = u_toupper(static_cast<UChar32>(c));
1495             if (c >= 'A' && c <= 'Z') {
1496                 c = cur;
1497             }
1498         }
1499     }
1500     return c;
1501 }
1502 
NeedIntersection(uint32_t c)1503 bool RegExpParser::NeedIntersection(uint32_t c)
1504 {
1505     return (c <= 'z' && c >= 'a') || (c <= 'Z' && c >= 'A');
1506 }
1507 }  // namespace panda::ecmascript
1508