1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "ecmascript/regexp/regexp_parser.h"
17
18 #include "ecmascript/base/string_helper.h"
19 #include "ecmascript/ecma_macros.h"
20 #include "ecmascript/regexp/regexp_opcode.h"
21 #include "libpandabase/utils/utils.h"
22 #include "securec.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uniset.h"
25 #define _NO_DEBUG_
26
27 namespace panda::ecmascript {
// Size in bytes of the stack buffer ParseGroupSpecifier collects a group name into; the same
// value is also used there as the threshold above which a byte is decoded as UTF-8.
static constexpr uint32_t CACHE_SIZE = 128;
// NOTE(review): not referenced in the visible part of this file; presumably the ASCII range size.
static constexpr uint32_t CHAR_MAXS = 128;
// 128-bit bitmap (four 32-bit words, one bit per ASCII code) with the bits for
// '$', 'A'-'Z', '_' and 'a'-'z' set.
static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
    /* $ A-Z _ a-z */
    0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
};
// Code points matched by \d: '0'-'9'.
static RangeSet g_rangeD(0x30, 0x39); // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
// Code points matched by \s (white space and line terminators).
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_rangeS({
    std::pair<uint32_t, uint32_t>(0x0009, 0x000D), // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0020, 0x0020), // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0), // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x1680, 0x1680), // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x2000, 0x200A), // NOLINTNEXTLINE(readability-magic-numbers)
    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
    std::pair<uint32_t, uint32_t>(0x2028, 0x2029), // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x202F, 0x202F), // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x205F, 0x205F), // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x3000, 0x3000), // NOLINTNEXTLINE(readability-magic-numbers)
    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
    std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF), // NOLINTNEXTLINE(readability-magic-numbers)
});

// Code points matched by \w: '0'-'9', 'A'-'Z', '_' and 'a'-'z'.
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_rangeW({
    std::pair<uint32_t, uint32_t>(0x0030, 0x0039), // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x005F, 0x005F), // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINTNEXTLINE(readability-magic-numbers)
});

// ASCII identifier-start set: '$', 'A'-'Z' and 'a'-'z'.
// NOTE(review): unlike ID_START_TABLE_ASCII this set does not contain '_' - confirm intended.
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_regexpIdentifyStart({
    std::pair<uint32_t, uint32_t>(0x0024, 0x0024), // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINTNEXTLINE(readability-magic-numbers)
});

// ASCII identifier-continue set: '$', '0'-'9', 'A'-'Z' and 'a'-'z'.
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_regexpIdentifyContinue({
    std::pair<uint32_t, uint32_t>(0x0024, 0x0024), // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0030, 0x0039), // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINTNEXTLINE(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINTNEXTLINE(readability-magic-numbers)
});
74
Parse()75 void RegExpParser::Parse()
76 {
77 // dynbuffer head init [size,capture_count,statck_count,flags]
78 buffer_.EmitU32(0);
79 buffer_.EmitU32(0);
80 buffer_.EmitU32(0);
81 buffer_.EmitU32(0);
82 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
83 PrintF("Parse Pattern------\n");
84 // Pattern[U, N]::
85 // Disjunction[?U, ?N]
86 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
87 Advance();
88 SaveStartOpCode saveStartOp;
89 int captureIndex = captureCount_++;
90 saveStartOp.EmitOpCode(&buffer_, captureIndex);
91 ParseDisjunction(false);
92 if (isError_) {
93 return;
94 }
95 if (c0_ != KEY_EOF) {
96 ParseError("extraneous characters at the end");
97 return;
98 }
99 SaveEndOpCode saveEndOp;
100 saveEndOp.EmitOpCode(&buffer_, captureIndex);
101 MatchEndOpCode matchEndOp;
102 matchEndOp.EmitOpCode(&buffer_, 0);
103 // dynbuffer head assignments
104 buffer_.PutU32(0, buffer_.size_);
105 buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
106 buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
107 buffer_.PutU32(FLAGS_OFFSET, flags_);
108 #ifndef _NO_DEBUG_
109 RegExpOpCode::DumpRegExpOpCode(std::cout, buffer_);
110 #endif
111 }
112
ParseDisjunction(bool isBackward)113 void RegExpParser::ParseDisjunction(bool isBackward)
114 {
115 // check stack overflow because infinite recursion may occur
116 DoParserStackOverflowCheck("invalid regular expression.");
117 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
118 PrintF("Parse Disjunction------\n");
119 size_t start = buffer_.size_;
120 ParseAlternative(isBackward);
121 if (isError_) {
122 return;
123 }
124 do {
125 if (c0_ == '|') {
126 SplitNextOpCode splitOp;
127 uint32_t len = buffer_.size_ - start;
128 GotoOpCode gotoOp;
129 splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
130 uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
131 Advance();
132 ParseAlternative(isBackward);
133 gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
134 }
135 } while (c0_ != KEY_EOF && c0_ != ')');
136 }
137
ParseOctalLiteral()138 uint32_t RegExpParser::ParseOctalLiteral()
139 {
140 // For compatibility with some other browsers (not all), we parse
141 // up to three octal digits with a value below 256.
142 // ES#prod-annexB-LegacyOctalEscapeSequence
143 uint32_t value = c0_ - '0';
144 Advance();
145 if (c0_ >= '0' && c0_ <= '7') {
146 value = value * OCTAL_VALUE + c0_ - '0';
147 Advance();
148 if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
149 value = value * OCTAL_VALUE + c0_ - '0';
150 Advance();
151 }
152 }
153 return value;
154 }
155
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)156 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
157 {
158 uint32_t x = 0;
159 int d = static_cast<int>(HexValue(c0_));
160 if (d < 0) {
161 return false;
162 }
163 while (d >= 0) {
164 if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
165 LOG_FULL(FATAL) << "value overflow";
166 return false;
167 }
168 x = x * HEX_VALUE + static_cast<uint32_t>(d);
169 if (x > maxValue) {
170 return false;
171 }
172 Advance();
173 d = static_cast<int>(HexValue(c0_));
174 }
175 *value = x;
176 return true;
177 }
178
// This parses RegExpUnicodeEscapeSequence as described in ECMA262.
// On entry '\' and 'u' have already been consumed and c0_ holds the character after 'u'.
// value: receives the parsed code point (or the lead surrogate alone when no valid trail
//        surrogate follows).
// Returns true on success. When the \u{...} form fails, pc_ is restored to the position saved
// on entry and false is returned; the \uXXXX form returns whatever ParseHexEscape reported.
bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
{
    // Accept both \uxxxx and \u{xxxxxx} (if allowed).
    // In the latter case, the number of hex digits between { } is arbitrary.
    // \ and u have already been read.
    if (c0_ == '{' && IsUtf16()) {
        // Remember where to rewind to; presumably pc_ - 1 addresses the byte currently in c0_.
        uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        Advance();
        // 0x10FFFF is the largest Unicode code point.
        if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { // NOLINTNEXTLINE(readability-magic-numbers)
            if (c0_ == '}') {
                Advance();
                return true;
            }
        }
        // Not a well-formed \u{...}: rewind and re-read the character at the saved position.
        pc_ = start;
        Advance();
        return false;
    }
    // \u but no {, or \u{...} escapes not allowed.
    bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
    if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
        // Attempt to read trail surrogate.
        uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        if (*pc_ == 'u') {
            Advance(UNICODE_HEX_ADVANCE);
            uint32_t trail = 0;
            if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
                // Combine lead and trail into one supplementary code point.
                *value = U16_GET_SUPPLEMENTARY((*value), (trail)); // NOLINTNEXTLINE(hicpp-signed-bitwise)
                return true;
            }
        }
        // No valid trail surrogate: rewind to the saved position and keep the lead surrogate
        // already stored in *value.
        pc_ = start;
        Advance();
    }
    return result;
}
216
ParseHexEscape(int length,uint32_t * value)217 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
218 {
219 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
220 uint32_t val = 0;
221 for (int i = 0; i < length; ++i) {
222 uint32_t c = c0_;
223 int d = static_cast<int>(HexValue(c));
224 if (d < 0) {
225 pc_ = start;
226 Advance();
227 return false;
228 }
229 val = val * HEX_VALUE + static_cast<uint32_t>(d);
230 Advance();
231 }
232 *value = val;
233 return true;
234 }
235
// Parses one Alternative: a sequence of terms that ends at '|', ')' or KEY_EOF.
// For every term the matching bytecode is emitted into buffer_; terms flagged as atoms are then
// handed to ParseQuantifier so that a following quantifier can wrap them.
// isBackward: when true, single-character matchers are bracketed by PrevOpCode and each finished
//             term is moved in front of the terms emitted earlier in this alternative (see the
//             memmove_s/memcpy_s at the end of the loop body).
// Returns early, without further cleanup, as soon as an error has been reported.
// NOLINTNEXTLINE(readability-function-size)
void RegExpParser::ParseAlternative(bool isBackward)
{
    // Offset of the first bytecode of this alternative.
    size_t start = buffer_.size_;
    while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
        if (isError_) {
            return;
        }
        // Offset of the bytecode of the term parsed in this iteration.
        size_t atomBcStart = buffer_.GetSize();
        int captureIndex = 0;
        bool isAtom = false;
        switch (c0_) {
            case '^': {
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("Assertion %c line start \n", c0_);
                LineStartOpCode lineStartOp;
                lineStartOp.EmitOpCode(&buffer_, 0);
                Advance();
                break;
            }
            case '$': {
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("Assertion %c line end \n", c0_);
                LineEndOpCode lineEndOp;
                lineEndOp.EmitOpCode(&buffer_, 0);
                Advance();
                break;
            }
            case '\\': {
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("Escape %c \n", c0_);
                Advance();
                switch (c0_) {
                    case 'b': {
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("Assertion %c \n", c0_);
                        WordBoundaryOpCode wordBoundaryOp;
                        wordBoundaryOp.EmitOpCode(&buffer_, 0);
                        Advance();
                        break;
                    }
                    case 'B': {
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("Assertion %c \n", c0_);
                        NotWordBoundaryOpCode notWordBoundaryOp;
                        notWordBoundaryOp.EmitOpCode(&buffer_, 0);
                        Advance();
                        break;
                    }
                    default: {
                        isAtom = true;
                        // -1 means ParseAtomEscape already emitted everything (or reported an
                        // error); any other value is a character that still has to be emitted.
                        int atomValue = ParseAtomEscape(isBackward);
                        if (atomValue != -1) {
                            PrevOpCode prevOp;
                            if (isBackward) {
                                prevOp.EmitOpCode(&buffer_, 0);
                            }
                            if (IsIgnoreCase()) {
                                if (!IsUtf16()) {
                                    atomValue = Canonicalize(atomValue, false);
                                } else {
                                    // Expand the character to its case-insensitive closure and
                                    // match it as a range set.
                                    icu::UnicodeSet set(atomValue, atomValue);
                                    set.closeOver(USET_CASE_INSENSITIVE);
                                    set.removeAllStrings();
                                    uint32_t size = static_cast<uint32_t>(set.size());
                                    RangeOpCode rangeOp;
                                    RangeSet rangeResult;
                                    for (uint32_t idx = 0; idx < size; idx++) {
                                        int32_t uc = set.charAt(idx);
                                        RangeSet curRange(uc);
                                        rangeResult.Insert(curRange);
                                    }
                                    rangeOp.InsertOpCode(&buffer_, rangeResult);
                                    // This break leaves the inner switch, so the Char/Char32
                                    // emission and the trailing PrevOpCode below are skipped.
                                    // NOTE(review): with isBackward set this path emits only the
                                    // leading PrevOpCode, and it always uses RangeOpCode rather
                                    // than choosing Range32OpCode by highest value as the '['
                                    // case does - confirm both are intended.
                                    break;
                                }
                            }
                            if (atomValue <= UINT16_MAX) {
                                CharOpCode charOp;
                                charOp.EmitOpCode(&buffer_, atomValue);
                            } else {
                                Char32OpCode charOp;
                                charOp.EmitOpCode(&buffer_, atomValue);
                            }
                            if (isBackward) {
                                prevOp.EmitOpCode(&buffer_, 0);
                            }
                        }
                        break;
                    }
                }
                break;
            }
            case '(': {
                Advance();
                isAtom = ParseAssertionCapture(&captureIndex, isBackward);
                // Step over the character ParseAssertionCapture stopped on (')' on success).
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                Advance();
                break;
            }
            case '.': {
                PrevOpCode prevOp;
                if (isBackward) {
                    prevOp.EmitOpCode(&buffer_, 0);
                }
                // With the dotAll flag '.' is emitted as AllOpCode, otherwise as DotsOpCode.
                if (IsDotAll()) {
                    AllOpCode allOp;
                    allOp.EmitOpCode(&buffer_, 0);
                } else {
                    DotsOpCode dotsOp;
                    dotsOp.EmitOpCode(&buffer_, 0);
                }
                if (isBackward) {
                    prevOp.EmitOpCode(&buffer_, 0);
                }
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("Atom %c match any \n", c0_);
                isAtom = true;
                Advance();
                break;
            }
            case '[': {
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("Atom %c match range \n", c0_);
                isAtom = true;
                PrevOpCode prevOp;
                Advance();
                if (isBackward) {
                    prevOp.EmitOpCode(&buffer_, 0);
                }
                // A leading '^' negates the class.
                bool isInvert = false;
                if (c0_ == '^') {
                    isInvert = true;
                    Advance();
                }
                RangeSet rangeResult;
                if (!ParseClassRanges(&rangeResult)) {
                    break;
                }
                if (isInvert) {
                    rangeResult.Invert(IsUtf16());
                }
                // Pick the 16-bit or 32-bit range opcode from the largest value in the set.
                uint32_t highValue = rangeResult.HighestValue();
                if (highValue <= UINT16_MAX) {
                    RangeOpCode rangeOp;
                    rangeOp.InsertOpCode(&buffer_, rangeResult);
                } else {
                    Range32OpCode rangeOp;
                    rangeOp.InsertOpCode(&buffer_, rangeResult);
                }

                if (isBackward) {
                    prevOp.EmitOpCode(&buffer_, 0);
                }
                break;
            }
            case '*':
            case '+':
            case '?':
                // A quantifier at the start of a term has nothing to apply to.
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                ParseError("nothing to repeat");
                return;
            case '{': {
                // Try to read "{...}" as an interval quantifier. If it is one it has nothing to
                // repeat; otherwise rewind and treat '{' like the characters below.
                uint8_t *begin = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                int dummy;
                if (ParserIntervalQuantifier(&dummy, &dummy)) {
                    ParseError("nothing to repeat");
                    return;
                }
                pc_ = begin;
                Advance();
            }
                [[fallthrough]];
            case '}':
            case ']':
                // Lone brackets are rejected when IsUtf16() is true and literal otherwise.
                if (IsUtf16()) {
                    ParseError("syntax error");
                    return;
                }
                [[fallthrough]];
            default: {
                // PatternCharacter
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("PatternCharacter %c\n", c0_);
                isAtom = true;
                {
                    PrevOpCode prevOp;
                    if (isBackward) {
                        prevOp.EmitOpCode(&buffer_, 0);
                    }
                    uint32_t matchedChar = c0_;
                    // Values above 128 are decoded as a multi-byte UTF-8 sequence: after Prev(),
                    // U8_NEXT decodes from pc_ and pc_ is moved past the bytes it consumed.
                    // NOTE(review): presumably Prev() steps pc_ back to the byte held in c0_.
                    if (c0_ > (INT8_MAX + 1)) {
                        Prev();
                        int i = 0;
                        UChar32 c;
                        int32_t length = end_ - pc_ + 1;
                        // NOLINTNEXTLINE(hicpp-signed-bitwise)
                        U8_NEXT(pc_, i, length, c); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                        matchedChar = static_cast<uint32_t>(c);
                        pc_ += i; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                    }
                    if (IsIgnoreCase()) {
                        matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
                    }
                    if (matchedChar > UINT16_MAX) {
                        Char32OpCode charOp;
                        charOp.EmitOpCode(&buffer_, matchedChar);
                    } else {
                        CharOpCode charOp;
                        charOp.EmitOpCode(&buffer_, matchedChar);
                    }
                    if (isBackward) {
                        prevOp.EmitOpCode(&buffer_, 0);
                    }
                }
                Advance();
                break;
            }
        }
        if (isAtom && !isError_) {
            ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
        }
        if (isBackward) {
            // Move the term just emitted (atomBcStart..end) in front of the earlier terms of this
            // alternative: shift [start, end) up by termSize, then copy the term, which now sits
            // at offset `end`, down to `start`. buffer_.size_ is not changed here, so the copy
            // left above `end` lies outside the logical buffer contents.
            size_t end = buffer_.GetSize();
            size_t termSize = end - atomBcStart;
            size_t moveSize = end - start;
            buffer_.Expand(end + termSize);
            if (memmove_s(buffer_.buf_ + start + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                              termSize,          // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                          moveSize,
                          buffer_.buf_ + start, // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                          moveSize) != EOK) {
                LOG_FULL(FATAL) << "memmove_s failed";
                UNREACHABLE();
            }
            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
            if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
                LOG_FULL(FATAL) << "memcpy_s failed";
                UNREACHABLE();
            }
        }
    }
}
478
FindGroupName(const CString & name)479 int RegExpParser::FindGroupName(const CString &name)
480 {
481 size_t len = 0;
482 size_t nameLen = name.size();
483 const char *p = reinterpret_cast<char *>(groupNames_.buf_);
484 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
485 const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
486 int captureIndex = 1;
487 while (p < bufEnd) {
488 len = strlen(p);
489 if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
490 return captureIndex;
491 }
492 p += len + 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
493 captureIndex++;
494 }
495 return -1;
496 }
497
// Parses what follows an opening '(' (already consumed by the caller): a lookahead or
// lookbehind assertion, a non-capturing group, a named capture group or a plain capture group.
// captureIndex: set to the index allocated for a capture group; untouched for assertions and
//               non-capturing groups.
// isBackward:   direction inherited from the enclosing context; lookbehind bodies are always
//               parsed with isBackward = true.
// Returns true when the construct is an atom that may take a quantifier (capture and
// non-capturing groups) and false for assertions and on error. On success c0_ is left on the
// closing ')', which the caller consumes.
bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
{
    bool isAtom = false;
    do {
        if (c0_ == '?') {
            Advance();
            switch (c0_) {
                // (?=Disjunction[?U, ?N])
                case '=': {
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                    PrintF("Assertion(?= Disjunction)\n");
                    Advance();
                    uint32_t start = buffer_.size_;
                    ParseDisjunction(isBackward);
                    // Terminate the assertion body with Match, then insert the MatchAhead
                    // opcode in front of it with the body length as operand.
                    MatchOpCode matchOp;
                    matchOp.EmitOpCode(&buffer_, 0);
                    MatchAheadOpCode matchAheadOp;
                    uint32_t len = buffer_.size_ - start;
                    matchAheadOp.InsertOpCode(&buffer_, start, len);
                    break;
                }
                // (?!Disjunction[?U, ?N])
                case '!': {
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                    PrintF("Assertion(?! Disjunction)\n");
                    uint32_t start = buffer_.size_;
                    Advance();
                    ParseDisjunction(isBackward);
                    MatchOpCode matchOp;
                    matchOp.EmitOpCode(&buffer_, 0);
                    NegativeMatchAheadOpCode matchAheadOp;
                    uint32_t len = buffer_.size_ - start;
                    matchAheadOp.InsertOpCode(&buffer_, start, len);
                    break;
                }
                case '<': {
                    Advance();
                    // (?<=Disjunction[?U, ?N])
                    if (c0_ == '=') {
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("Assertion(?<= Disjunction)\n");
                        Advance();
                        uint32_t start = buffer_.size_;
                        // Lookbehind: the body is compiled in backward mode.
                        ParseDisjunction(true);
                        MatchOpCode matchOp;
                        matchOp.EmitOpCode(&buffer_, 0);
                        MatchAheadOpCode matchAheadOp;
                        uint32_t len = buffer_.size_ - start;
                        matchAheadOp.InsertOpCode(&buffer_, start, len);
                        // (?<!Disjunction[?U, ?N])
                    } else if (c0_ == '!') {
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("Assertion(?<! Disjunction)\n");
                        Advance();
                        uint32_t start = buffer_.size_;
                        ParseDisjunction(true);
                        MatchOpCode matchOp;
                        matchOp.EmitOpCode(&buffer_, 0);
                        NegativeMatchAheadOpCode matchAheadOp;
                        uint32_t len = buffer_.size_ - start;
                        matchAheadOp.InsertOpCode(&buffer_, start, len);
                    } else {
                        // (?<name>Disjunction): named capture group. The name is read straight
                        // from the input through pc_, which ParseGroupSpecifier advances.
                        Prev();
                        CString name;
                        auto **pp = const_cast<const uint8_t **>(&pc_);
                        if (!ParseGroupSpecifier(pp, name)) {
                            ParseError("GroupName Syntax error.");
                            return false;
                        }
                        if (FindGroupName(name) > 0) {
                            ParseError("Duplicate GroupName error.");
                            return false;
                        }
                        groupNames_.EmitStr(name.c_str());
                        newGroupNames_.push_back(name);
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("group name %s", name.c_str());
                        Advance();
                        // Continue with the capture handling shared with unnamed groups.
                        goto parseCapture; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto)
                    }
                    break;
                }
                // (?:Disjunction[?U, ?N])
                case ':':
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                    PrintF("Atom(?<: Disjunction)\n");
                    isAtom = true;
                    Advance();
                    ParseDisjunction(isBackward);
                    break;
                default:
                    Advance();
                    ParseError("? Syntax error.");
                    return false;
            }
            if (isError_) {
                return false;
            }
        } else {
            // Unnamed capture group: record an empty name so that the entries in groupNames_
            // stay aligned with capture indices.
            groupNames_.EmitChar(0);
        parseCapture:
            isAtom = true;
            *captureIndex = captureCount_++;
            SaveEndOpCode saveEndOp;
            SaveStartOpCode saveStartOp;
            // In backward mode the two save opcodes are emitted in the opposite order.
            if (isBackward) {
                saveEndOp.EmitOpCode(&buffer_, *captureIndex);
            } else {
                saveStartOp.EmitOpCode(&buffer_, *captureIndex);
            }
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("capture start %d \n", *captureIndex);
            ParseDisjunction(isBackward);
            if (isError_) {
                return false;
            }
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("capture end %d \n", *captureIndex);
            if (isBackward) {
                saveStartOp.EmitOpCode(&buffer_, *captureIndex);
            } else {
                saveEndOp.EmitOpCode(&buffer_, *captureIndex);
            }
        }
    } while (c0_ != ')' && c0_ != KEY_EOF);
    // Reaching the end of input here means the group was never closed.
    if (c0_ != ')') {
        ParseError("capture syntax error");
        return false;
    }
    return isAtom;
}
629
ParseDecimalDigits()630 int RegExpParser::ParseDecimalDigits()
631 {
632 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
633 PrintF("Parse DecimalDigits------\n");
634 uint32_t result = 0;
635 bool overflow = false;
636 while (true) {
637 if (c0_ < '0' || c0_ > '9') {
638 break;
639 }
640 if (!overflow) {
641 if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
642 overflow = true;
643 } else {
644 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
645 }
646 }
647 Advance();
648 }
649 if (overflow) {
650 return INT32_MAX;
651 }
652 return result;
653 }
654
ParserIntervalQuantifier(int * pmin,int * pmax)655 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
656 {
657 // Quantifier::
658 // QuantifierPrefix
659 // QuantifierPrefix?
660 // QuantifierPrefix::
661 // *
662 // +
663 // ?
664 // {DecimalDigits}
665 // {DecimalDigits,}
666 // {DecimalDigits,DecimalDigits}
667 Advance();
668 *pmin = ParseDecimalDigits();
669 *pmax = *pmin;
670 switch (c0_) {
671 case ',': {
672 Advance();
673 if (c0_ == '}') {
674 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
675 PrintF("QuantifierPrefix{DecimalDigits,}\n");
676 *pmax = INT32_MAX;
677 Advance();
678 } else {
679 *pmax = ParseDecimalDigits();
680 if (c0_ == '}') {
681 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
682 PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
683 Advance();
684 } else {
685 return false;
686 }
687 }
688 break;
689 }
690 case '}':
691 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
692 PrintF("QuantifierPrefix{DecimalDigits}\n");
693 Advance();
694 break;
695 default:
696 Advance();
697 return false;
698 }
699 return true;
700 }
701
ParseQuantifier(size_t atomBcStart,int captureStart,int captureEnd)702 void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
703 {
704 int min = -1;
705 int max = -1;
706 bool isGreedy = true;
707 switch (c0_) {
708 case '*':
709 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
710 PrintF("QuantifierPrefix %c\n", c0_);
711 min = 0;
712 max = INT32_MAX;
713 Advance();
714 break;
715 case '+':
716 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
717 PrintF("QuantifierPrefix %c\n", c0_);
718 min = 1;
719 max = INT32_MAX;
720 Advance();
721 break;
722 case '?':
723 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
724 PrintF("QuantifierPrefix %c\n", c0_);
725 Advance();
726 min = 0;
727 max = 1;
728 break;
729 case '{': {
730 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
731 if (!ParserIntervalQuantifier(&min, &max)) {
732 pc_ = start;
733 Advance(); // back to '{'
734 return;
735 }
736 if (min > max) {
737 ParseError("Invalid repetition count");
738 return;
739 }
740 break;
741 }
742 default:
743 break;
744 }
745 if (c0_ == '?') {
746 isGreedy = false;
747 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
748 PrintF("Quantifier::QuantifierPrefix?\n");
749 Advance();
750 } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
751 ParseError("nothing to repeat");
752 return;
753 }
754 if (min != -1 && max != -1) {
755 stackCount_++;
756 PushOpCode pushOp;
757 pushOp.InsertOpCode(&buffer_, atomBcStart);
758 atomBcStart += pushOp.GetSize();
759
760 if (captureStart != 0) {
761 SaveResetOpCode saveResetOp;
762 saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
763 }
764
765 // zero advance check
766 if (max == INT32_MAX) {
767 stackCount_++;
768 PushCharOpCode pushCharOp;
769 pushCharOp.InsertOpCode(&buffer_, atomBcStart);
770 CheckCharOpCode checkCharOp;
771 // NOLINTNEXTLINE(readability-magic-numbers)
772 checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
773 }
774
775 if (isGreedy) {
776 LoopGreedyOpCode loopOp;
777 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
778 } else {
779 LoopOpCode loopOp;
780 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
781 }
782
783 if (min == 0) {
784 if (isGreedy) {
785 SplitNextOpCode splitNextOp;
786 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
787 } else {
788 SplitFirstOpCode splitFirstOp;
789 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
790 }
791 }
792
793 PopOpCode popOp;
794 popOp.EmitOpCode(&buffer_);
795 }
796 }
797
ParseGroupSpecifier(const uint8_t ** pp,CString & name)798 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, CString &name)
799 {
800 const uint8_t *p = *pp;
801 uint32_t c = 0;
802 char buffer[CACHE_SIZE] = {0};
803 char *q = buffer;
804 while (true) {
805 if (p <= end_) {
806 c = *p;
807 } else {
808 c = KEY_EOF;
809 }
810 if (c == '\\') {
811 p++;
812 if (*p != 'u') {
813 return false;
814 }
815 if (!ParseUnicodeEscape(&c)) {
816 return false;
817 }
818 } else if (c == '>') {
819 break;
820 } else if (c > CACHE_SIZE && c != KEY_EOF) {
821 c = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
822 } else if (c != KEY_EOF) {
823 p++;
824 } else {
825 return false;
826 }
827 if (q == buffer) {
828 if (!IsIdentFirst(c)) {
829 return false;
830 }
831 } else {
832 if (!u_isIDPart(c)) {
833 return false;
834 }
835 }
836 if (q != nullptr) {
837 *q++ = c;
838 }
839 } // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
840 p++;
841 *pp = p;
842 name = buffer;
843 return true;
844 }
845
// Scans the whole pattern from base_ to end_, counting capture groups without emitting any
// bytecode.
// groupName: when non-null, the scan stops at the first named group whose name equals groupName
//            and returns that group's capture index; when null, names are not parsed.
// Returns the running capture counter, which starts at 1 and is incremented for each capture
// group seen.
// Side effect: hasNamedCaptures_ is reset to 0 and set to 1 when a "(?<" that is not a
// lookbehind is found.
// NOTE(review): when groupName is non-null but no group matches, the total count is returned
// rather than a negative value, so the `< 0` check made by the \k handling on this function's
// result can never be true - confirm intended.
int RegExpParser::ParseCaptureCount(const char *groupName)
{
    const uint8_t *p = nullptr;
    int captureIndex = 1;
    CString name;
    hasNamedCaptures_ = 0;
    for (p = base_; p < end_; p++) { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        switch (*p) {
            case '(': {
                if (p[1] == '?') { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                    // "(?<" not followed by '!' or '=' starts a named capture group; every other
                    // "(?" form is not counted.
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                    if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                        p[CAPTURE_CONUT_ADVANCE] != '=') {
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                        hasNamedCaptures_ = 1;
                        p += CAPTURE_CONUT_ADVANCE;
                        if (groupName != nullptr) {
                            if (ParseGroupSpecifier(&p, name)) {
                                if (strcmp(name.c_str(), groupName) == 0) {
                                    return captureIndex;
                                }
                            }
                        }
                        captureIndex++;
                    }
                } else {
                    captureIndex++;
                }
                break;
            }
            case '\\':
                // Skip the escaped character so that an escaped '(' or '[' is not interpreted.
                p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                break;
            case '[': {
                // Skip the character class; a '(' inside it is not a group.
                while (p < end_ && *p != ']') {
                    if (*p == '\\') {
                        p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                    }
                    p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                }
                break;
            }
            default:
                break;
        }
    }
    return captureIndex;
}
895
// Parses the part of an AtomEscape that follows the backslash (c0_ is on the character after it).
// Back references and character class escapes are emitted into buffer_ here; everything else is
// delegated to ParseCharacterEscape.
// isBackward: when true, class escapes are bracketed by PrevOpCode and back references use the
//             backward opcode.
// Returns -1 when the escape was handled here (bytecode emitted or error reported); otherwise
// the value returned by ParseCharacterEscape, which the caller emits as a character.
// NOLINTNEXTLINE(readability-function-size)
int RegExpParser::ParseAtomEscape(bool isBackward)
{
    // AtomEscape[U, N]::
    //     DecimalEscape
    //     CharacterClassEscape[?U]
    //     CharacterEscape[?U]
    //     [+N]kGroupName[?U]
    int result = -1;
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
    PrintF("Parse AtomEscape------\n");
    PrevOpCode prevOp;
    switch (c0_) {
        case KEY_EOF:
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            ParseError("unexpected end");
            break;
        // DecimalEscape
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9': {
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("NonZeroDigit %c\n", c0_);
            int capture = ParseDecimalDigits();
            // The reference may point at a group that has not been parsed yet, so the whole
            // pattern is rescanned before the index is rejected.
            if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
                ParseError("invalid backreference count");
                break;
            }
            if (isBackward) {
                BackwardBackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, capture);
            } else {
                BackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, capture);
            }
            break;
        }
        // CharacterClassEscape
        // Each class escape below emits a leading PrevOpCode in backward mode and then jumps to
        // parseLookBehind, which emits the trailing PrevOpCode and consumes the escape letter.
        case 'd': {
            // [0-9]
            RangeOpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, g_rangeD);
            goto parseLookBehind;
        }
        case 'D': {
            // [^0-9]
            RangeSet atomRange(g_rangeD);
            atomRange.Invert(IsUtf16());
            Range32OpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, atomRange);
            goto parseLookBehind;
        }
        case 's': {
            // White space and line terminators as listed in g_rangeS
            RangeOpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, g_rangeS);
            goto parseLookBehind;
        }
        case 'S': {
            // Complement of g_rangeS
            RangeSet atomRange(g_rangeS);
            Range32OpCode rangeOp;
            atomRange.Invert(IsUtf16());
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, atomRange);
            goto parseLookBehind;
        }
        case 'w': {
            // [A-Za-z0-9_]
            RangeOpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, g_rangeW);
            goto parseLookBehind;
        }
        case 'W': {
            // [^A-Za-z0-9_]
            RangeSet atomRange(g_rangeW);
            atomRange.Invert(IsUtf16());
            Range32OpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, atomRange);
            goto parseLookBehind;
        }
        // P{UnicodePropertyValueExpression}
        // p{UnicodePropertyValueExpression}
        // NOTE(review): 'P' and 'p' fall through to the \k handling below; Unicode property
        // escapes have no handling of their own here - confirm intended.
        case 'P':
        case 'p':
        // [+N]kGroupName[?U]
        case 'k': {
            Advance();
            if (c0_ != '<') {
                if (!IsUtf16() || HasNamedCaptures()) {
                    ParseError("expecting group name.");
                    break;
                }
            }
            Advance();
            Prev();
            // The name is read straight from the input through pc_.
            CString name;
            auto **pp = const_cast<const uint8_t **>(&pc_);
            if (!ParseGroupSpecifier(pp, name)) {
                ParseError("GroupName Syntax error.");
                break;
            }
            // Look among the groups parsed so far first, then rescan the whole pattern for a
            // group defined later.
            int postion = FindGroupName(name);
            if (postion < 0) {
                postion = ParseCaptureCount(name.c_str());
                if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
                    ParseError("group name not defined");
                    break;
                }
            }
            if (isBackward) {
                BackwardBackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, postion);
            } else {
                BackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, postion);
            }
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            Advance();
            break;
        }
        // Shared tail of the character class escapes above.
        parseLookBehind: {
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            Advance();
            break;
        }
        default:
            result = ParseCharacterEscape();
            break;
    }
    return result;
}
1052
RecountCaptures()1053 int RegExpParser::RecountCaptures()
1054 {
1055 if (totalCaptureCount_ < 0) {
1056 const char *name = reinterpret_cast<const char*>(groupNames_.buf_);
1057 totalCaptureCount_ = ParseCaptureCount(name);
1058 }
1059 return totalCaptureCount_;
1060 }
HasNamedCaptures()1061 bool RegExpParser::HasNamedCaptures()
1062 {
1063 if (hasNamedCaptures_ < 0) {
1064 RecountCaptures();
1065 }
1066 return false;
1067 }
1068
ParseCharacterEscape()1069 int RegExpParser::ParseCharacterEscape()
1070 {
1071 // CharacterEscape[U]::
1072 // ControlEscape
1073 // c ControlLetter
1074 // 0 [lookahead ∉ DecimalDigit]
1075 // HexEscapeSequence
1076 // RegExpUnicodeEscapeSequence[?U]
1077 // IdentityEscape[?U]
1078 uint32_t result = 0;
1079 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1080 switch (c0_) {
1081 // ControlEscape
1082 case 'f':
1083 result = '\f';
1084 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1085 PrintF("ControlEscape %c\n", c0_);
1086 Advance();
1087 break;
1088 case 'n':
1089 result = '\n';
1090 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1091 PrintF("ControlEscape %c\n", c0_);
1092 Advance();
1093 break;
1094 case 'r':
1095 result = '\r';
1096 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1097 PrintF("ControlEscape %c\n", c0_);
1098 Advance();
1099 break;
1100 case 't':
1101 result = '\t';
1102 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1103 PrintF("ControlEscape %c\n", c0_);
1104 Advance();
1105 break;
1106 case 'v':
1107 result = '\v';
1108 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1109 PrintF("ControlEscape %c\n", c0_);
1110 Advance();
1111 break;
1112 // c ControlLetter
1113 case 'c': {
1114 Advance();
1115 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1116 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1117 PrintF("ControlLetter %c\n", c0_);
1118 result = static_cast<uint32_t>(c0_) & 0x1f; // NOLINTNEXTLINE(readability-magic-numbers)
1119 Advance();
1120 } else {
1121 if (!IsUtf16()) {
1122 pc_--; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1123 result = '\\';
1124 } else {
1125 ParseError("Invalid control letter");
1126 return -1;
1127 }
1128 }
1129 break;
1130 }
1131 case '0': {
1132 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1133 PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
1134 if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) { // NOLINTNEXTLINE(readability-magic-numbers)
1135 Advance();
1136 result = 0;
1137 break;
1138 }
1139 [[fallthrough]];
1140 }
1141 case '1':
1142 case '2':
1143 case '3':
1144 case '4':
1145 case '5':
1146 case '6':
1147 case '7': {
1148 if (IsUtf16()) {
1149 // With /u, decimal escape is not interpreted as octal character code.
1150 ParseError("Invalid class escape");
1151 return 0;
1152 }
1153 result = ParseOctalLiteral();
1154 break;
1155 }
1156 // ParseHexEscapeSequence
1157 // ParseRegExpUnicodeEscapeSequence
1158 case 'x': {
1159 Advance();
1160 if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
1161 return result;
1162 }
1163 if (IsUtf16()) {
1164 ParseError("Invalid class escape");
1165 return -1;
1166 }
1167 result = 'x';
1168 break;
1169 }
1170 case 'u': {
1171 Advance();
1172 if (ParseUnicodeEscape(&result)) {
1173 return result;
1174 }
1175 if (IsUtf16()) {
1176 // With /u, invalid escapes are not treated as identity escapes.
1177 ParseError("Invalid unicode escape");
1178 return 0;
1179 }
1180 // If \u is not followed by a two-digit hexadecimal, treat it
1181 // as an identity escape.
1182 result = 'u';
1183 break;
1184 }
1185 // IdentityEscape[?U]
1186 case '$':
1187 case '(':
1188 case ')':
1189 case '*':
1190 case '+':
1191 case '.':
1192 case '/':
1193 case '?':
1194 case '[':
1195 case '\\':
1196 case ']':
1197 case '^':
1198 case '{':
1199 case '|':
1200 case '}':
1201 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1202 PrintF("IdentityEscape %c\n", c0_);
1203 result = c0_;
1204 Advance();
1205 break;
1206 default: {
1207 if (IsUtf16()) {
1208 ParseError("Invalid unicode escape");
1209 return 0;
1210 }
1211 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1212 PrintF("SourceCharacter %c\n", c0_);
1213 result = c0_;
1214 if (result < CHAR_MAXS) {
1215 Advance();
1216 } else {
1217 Prev();
1218 const uint8_t *p = pc_;
1219 result = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
1220 int offset = static_cast<int>(p - pc_);
1221 Advance(offset + 1);
1222 }
1223 break;
1224 }
1225 }
1226 return static_cast<int>(result);
1227 }
1228
ParseClassRanges(RangeSet * result)1229 bool RegExpParser::ParseClassRanges(RangeSet *result)
1230 {
1231 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1232 PrintF("Parse ClassRanges------\n");
1233 while (c0_ != ']') {
1234 RangeSet s1;
1235 bool needInter = false;
1236 uint32_t c1 = ParseClassAtom(&s1);
1237 if (c1 == UINT32_MAX) {
1238 ParseError("invalid class range");
1239 return false;
1240 }
1241 needInter = NeedIntersection(c1);
1242 int next_c0 = *pc_;
1243 if (c0_ == '-' && next_c0 != ']') {
1244 if (c1 == CLASS_RANGE_BASE) {
1245 if (IsUtf16()) {
1246 ParseError("invalid class range");
1247 return false;
1248 }
1249 result->Insert(s1);
1250 continue;
1251 }
1252 Advance();
1253 RangeSet s2;
1254 uint32_t c2 = ParseClassAtom(&s2);
1255 if (c2 == UINT32_MAX) {
1256 ParseError("invalid class range");
1257 return false;
1258 }
1259 if (c2 == CLASS_RANGE_BASE) {
1260 if (IsUtf16()) {
1261 ParseError("invalid class range");
1262 return false;
1263 }
1264 result->Insert(s2);
1265 continue;
1266 }
1267 if (c1 < INT8_MAX) {
1268 if (c1 > c2) {
1269 ParseError("invalid class range");
1270 return false;
1271 }
1272 }
1273 needInter = NeedIntersection(c2);
1274 result->Insert(c1, c2);
1275 if (IsIgnoreCase() && needInter) {
1276 ProcessIntersection(result);
1277 }
1278 } else {
1279 result->Insert(s1);
1280 if (!(IsIgnoreCase() && needInter)) {
1281 continue;
1282 }
1283 if (c1 <= 'z' && c1 >= 'a') {
1284 result->Insert(RangeSet(c1 - 'a' + 'A'));
1285 } else {
1286 result->Insert(RangeSet(c1 - 'A' + 'a'));
1287 }
1288 }
1289 }
1290 Advance();
1291 return true;
1292 }
1293
ParseClassAtom(RangeSet * atom)1294 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1295 {
1296 uint32_t ret = UINT32_MAX;
1297 switch (c0_) {
1298 case '\\': {
1299 Advance();
1300 ret = static_cast<uint32_t>(ParseClassEscape(atom));
1301 break;
1302 }
1303 case KEY_EOF:
1304 break;
1305 case 0: {
1306 if (pc_ >= end_) {
1307 return UINT32_MAX;
1308 }
1309 [[fallthrough]];
1310 }
1311 default: {
1312 uint32_t value = c0_;
1313 size_t u16_size = 0;
1314 if (c0_ > INT8_MAX) { // NOLINTNEXTLINE(readability-magic-numbers)
1315 pc_ -= 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1316 auto u16_result = base::utf_helper::ConvertUtf8ToUtf16Pair(pc_, true);
1317 value = u16_result.first;
1318 u16_size = u16_result.second;
1319 Advance(u16_size + 1);
1320 } else {
1321 Advance();
1322 }
1323 atom->Insert(RangeSet(value));
1324 ret = value;
1325 break;
1326 }
1327 }
1328 return ret;
1329 }
1330
ParseClassEscape(RangeSet * atom)1331 int RegExpParser::ParseClassEscape(RangeSet *atom)
1332 {
1333 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1334 PrintF("Parse ClassEscape------\n");
1335 int result = -1;
1336 switch (c0_) {
1337 case 'b':
1338 Advance();
1339 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1340 PrintF("ClassEscape %c", 'b');
1341 result = '\b';
1342 atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1343 break;
1344 case '-':
1345 Advance();
1346 result = '-';
1347 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1348 PrintF("ClassEscape %c", '-');
1349 atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1350 break;
1351 // CharacterClassEscape
1352 case 'd':
1353 case 'D':
1354 result = CLASS_RANGE_BASE;
1355 atom->Insert(g_rangeD);
1356 if (c0_ == 'D') {
1357 atom->Invert(IsUtf16());
1358 }
1359 Advance();
1360 break;
1361 case 's':
1362 case 'S':
1363 result = CLASS_RANGE_BASE;
1364 atom->Insert(g_rangeS);
1365 if (c0_ == 'S') {
1366 atom->Invert(IsUtf16());
1367 }
1368 Advance();
1369 break;
1370 case 'w':
1371 case 'W':
1372 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1373 PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1374 result = CLASS_RANGE_BASE;
1375 atom->Insert(g_rangeW);
1376 if (c0_ == 'W') {
1377 atom->Invert(IsUtf16());
1378 }
1379 Advance();
1380 break;
1381 // P{UnicodePropertyValueExpression}
1382 // p{UnicodePropertyValueExpression}
1383 case 'P':
1384 case 'p':
1385 PrintF("Warning: \\p is not supported in ECMA 2015!");
1386 Advance();
1387 if (c0_ == '{') {
1388 Advance();
1389 if (c0_ == '}') {
1390 break; // p{}, invalid
1391 }
1392 bool isValue = false;
1393 ParseUnicodePropertyValueCharacters(&isValue);
1394 if (!isValue && c0_ == '=') {
1395 // UnicodePropertyName = UnicodePropertyValue
1396 Advance();
1397 if (c0_ == '}') {
1398 break; // p{xxx=}, invalid
1399 }
1400 ParseUnicodePropertyValueCharacters(&isValue);
1401 }
1402 if (c0_ != '}') {
1403 break; // p{xxx, invalid
1404 }
1405 // should do atom->Invert() here after ECMA 9.0
1406 Advance();
1407 result = CLASS_RANGE_BASE;
1408 }
1409 break;
1410 default:
1411 result = ParseCharacterEscape();
1412 int value = result;
1413 if (IsIgnoreCase()) {
1414 value = Canonicalize(value, IsUtf16());
1415 }
1416 atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1417 break;
1418 }
1419 return result;
1420 }
1421
ParseUnicodePropertyValueCharacters(bool * isValue)1422 void RegExpParser::ParseUnicodePropertyValueCharacters(bool *isValue)
1423 {
1424 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1425 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1426 PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1427 } else if (c0_ == '_') {
1428 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1429 PrintF("UnicodePropertyCharacter:: _ \n");
1430 } else if (c0_ >= '0' && c0_ <= '9') {
1431 *isValue = true;
1432 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1433 PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1434 } else {
1435 return;
1436 }
1437 Advance();
1438 ParseUnicodePropertyValueCharacters(isValue);
1439 }
1440
1441 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1442 void RegExpParser::PrintF(const char *fmt, ...)
1443 {
1444 #ifndef _NO_DEBUG_
1445 va_list args;
1446 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,)
1447 va_start(args, fmt);
1448 vprintf(fmt, args);
1449 va_end(args);
1450 #else
1451 (void)fmt;
1452 #endif
1453 }
1454
ParseError(const char * errorMessage)1455 void RegExpParser::ParseError(const char *errorMessage)
1456 {
1457 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1458 PrintF("error: ");
1459 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1460 PrintF(errorMessage);
1461 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1462 PrintF("\n");
1463 SetIsError();
1464 size_t length = strlen(errorMessage) + 1;
1465 if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1466 LOG_FULL(FATAL) << "memcpy_s failed";
1467 UNREACHABLE();
1468 }
1469 }
1470
IsIdentFirst(uint32_t c)1471 int RegExpParser::IsIdentFirst(uint32_t c)
1472 {
1473 if (c < CACHE_SIZE) {
1474 return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1; // 5: Shift five bits 31: and operation binary of 31
1475 } else {
1476 return static_cast<int>(u_isIDStart(c));
1477 }
1478 }
1479
Canonicalize(int c,bool isUnicode)1480 int RegExpParser::Canonicalize(int c, bool isUnicode)
1481 {
1482 if (c < TMP_BUF_SIZE) { // NOLINTNEXTLINE(readability-magic-numbers)
1483 if (c >= 'a' && c <= 'z') {
1484 c = c - 'a' + 'A';
1485 }
1486 } else {
1487 int cur = c;
1488 if (isUnicode) {
1489 c = u_tolower(static_cast<UChar32>(c));
1490 if (c >= 'a' && c <= 'z') {
1491 c = cur;
1492 }
1493 } else {
1494 c = u_toupper(static_cast<UChar32>(c));
1495 if (c >= 'A' && c <= 'Z') {
1496 c = cur;
1497 }
1498 }
1499 }
1500 return c;
1501 }
1502
NeedIntersection(uint32_t c)1503 bool RegExpParser::NeedIntersection(uint32_t c)
1504 {
1505 return (c <= 'z' && c >= 'a') || (c <= 'Z' && c >= 'A');
1506 }
1507
DoParserStackOverflowCheck(const char * errorMessage)1508 void RegExpParser::DoParserStackOverflowCheck(const char *errorMessage)
1509 {
1510 if (UNLIKELY(thread_->GetCurrentStackPosition() < thread_->GetStackLimit())) {
1511 LOG_ECMA(ERROR) << "Stack overflow! current:" << thread_->GetCurrentStackPosition() <<
1512 " limit:" << thread_->GetStackLimit();
1513 ParseError(errorMessage);
1514 return;
1515 }
1516 }
1517 } // namespace panda::ecmascript
1518