1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "ecmascript/regexp/regexp_parser.h"
17
18 #include "ecmascript/base/string_helper.h"
19 #include "ecmascript/ecma_macros.h"
20 #include "ecmascript/regexp/regexp_opcode.h"
21 #include "libpandabase/utils/utils.h"
22 #include "securec.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uniset.h"
25 #define _NO_DEBUG_
26
27 namespace panda::ecmascript {
28 static constexpr uint32_t CACHE_SIZE = 128;
29 static constexpr uint32_t CHAR_MAXS = 128;
30 static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
31 /* $ A-Z _ a-z */
32 0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
33 };
34 static RangeSet g_rangeD(0x30, 0x39); // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
35 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
36 static RangeSet g_rangeS({
37 std::pair<uint32_t, uint32_t>(0x0009, 0x000D), // NOLINTNEXTLINE(readability-magic-numbers)
38 std::pair<uint32_t, uint32_t>(0x0020, 0x0020), // NOLINTNEXTLINE(readability-magic-numbers)
39 std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0), // NOLINTNEXTLINE(readability-magic-numbers)
40 std::pair<uint32_t, uint32_t>(0x1680, 0x1680), // NOLINTNEXTLINE(readability-magic-numbers)
41 std::pair<uint32_t, uint32_t>(0x2000, 0x200A), // NOLINTNEXTLINE(readability-magic-numbers)
42 /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
43 /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
44 std::pair<uint32_t, uint32_t>(0x2028, 0x2029), // NOLINTNEXTLINE(readability-magic-numbers)
45 std::pair<uint32_t, uint32_t>(0x202F, 0x202F), // NOLINTNEXTLINE(readability-magic-numbers)
46 std::pair<uint32_t, uint32_t>(0x205F, 0x205F), // NOLINTNEXTLINE(readability-magic-numbers)
47 std::pair<uint32_t, uint32_t>(0x3000, 0x3000), // NOLINTNEXTLINE(readability-magic-numbers)
48 /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
49 std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF), // NOLINTNEXTLINE(readability-magic-numbers)
50 });
51
52 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
53 static RangeSet g_rangeW({
54 std::pair<uint32_t, uint32_t>(0x0030, 0x0039), // NOLINTNEXTLINE(readability-magic-numbers)
55 std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINTNEXTLINE(readability-magic-numbers)
56 std::pair<uint32_t, uint32_t>(0x005F, 0x005F), // NOLINTNEXTLINE(readability-magic-numbers)
57 std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINTNEXTLINE(readability-magic-numbers)
58 });
59
60 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
61 static RangeSet g_regexpIdentifyStart({
62 std::pair<uint32_t, uint32_t>(0x0024, 0x0024), // NOLINTNEXTLINE(readability-magic-numbers)
63 std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINTNEXTLINE(readability-magic-numbers)
64 std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINTNEXTLINE(readability-magic-numbers)
65 });
66
67 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
68 static RangeSet g_regexpIdentifyContinue({
69 std::pair<uint32_t, uint32_t>(0x0024, 0x0024), // NOLINTNEXTLINE(readability-magic-numbers)
70 std::pair<uint32_t, uint32_t>(0x0030, 0x0039), // NOLINTNEXTLINE(readability-magic-numbers)
71 std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINTNEXTLINE(readability-magic-numbers)
72 std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINTNEXTLINE(readability-magic-numbers)
73 });
74
Parse()75 void RegExpParser::Parse()
76 {
77 // dynbuffer head init [size,capture_count,statck_count,flags]
78 buffer_.EmitU32(0);
79 buffer_.EmitU32(0);
80 buffer_.EmitU32(0);
81 buffer_.EmitU32(0);
82 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
83 PrintF("Parse Pattern------\n");
84 // Pattern[U, N]::
85 // Disjunction[?U, ?N]
86 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
87 Advance();
88 SaveStartOpCode saveStartOp;
89 int captureIndex = captureCount_++;
90 saveStartOp.EmitOpCode(&buffer_, captureIndex);
91 ParseDisjunction(false);
92 if (c0_ != KEY_EOF) {
93 ParseError("extraneous characters at the end");
94 return;
95 }
96 SaveEndOpCode saveEndOp;
97 saveEndOp.EmitOpCode(&buffer_, captureIndex);
98 MatchEndOpCode matchEndOp;
99 matchEndOp.EmitOpCode(&buffer_, 0);
100 // dynbuffer head assignments
101 buffer_.PutU32(0, buffer_.size_);
102 buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
103 buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
104 buffer_.PutU32(FLAGS_OFFSET, flags_);
105 #ifndef _NO_DEBUG_
106 RegExpOpCode::DumpRegExpOpCode(std::cout, buffer_);
107 #endif
108 }
109
ParseDisjunction(bool isBackward)110 void RegExpParser::ParseDisjunction(bool isBackward)
111 {
112 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
113 PrintF("Parse Disjunction------\n");
114 size_t start = buffer_.size_;
115 ParseAlternative(isBackward);
116 if (isError_) {
117 return;
118 }
119 do {
120 if (c0_ == '|') {
121 SplitNextOpCode splitOp;
122 uint32_t len = buffer_.size_ - start;
123 GotoOpCode gotoOp;
124 splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
125 uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
126 Advance();
127 ParseAlternative(isBackward);
128 gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
129 }
130 } while (c0_ != KEY_EOF && c0_ != ')');
131 }
132
ParseOctalLiteral()133 uint32_t RegExpParser::ParseOctalLiteral()
134 {
135 // For compatibility with some other browsers (not all), we parse
136 // up to three octal digits with a value below 256.
137 // ES#prod-annexB-LegacyOctalEscapeSequence
138 uint32_t value = c0_ - '0';
139 Advance();
140 if (c0_ >= '0' && c0_ <= '7') {
141 value = value * OCTAL_VALUE + c0_ - '0';
142 Advance();
143 if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
144 value = value * OCTAL_VALUE + c0_ - '0';
145 Advance();
146 }
147 }
148 return value;
149 }
150
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)151 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
152 {
153 uint32_t x = 0;
154 int d = static_cast<int>(HexValue(c0_));
155 if (d < 0) {
156 return false;
157 }
158 while (d >= 0) {
159 if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
160 LOG_FULL(FATAL) << "value overflow";
161 return false;
162 }
163 x = x * HEX_VALUE + static_cast<uint32_t>(d);
164 if (x > maxValue) {
165 return false;
166 }
167 Advance();
168 d = static_cast<int>(HexValue(c0_));
169 }
170 *value = x;
171 return true;
172 }
173
174 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uint32_t * value)175 bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
176 {
177 // Accept both \uxxxx and \u{xxxxxx} (if allowed).
178 // In the latter case, the number of hex digits between { } is arbitrary.
179 // \ and u have already been read.
180 if (c0_ == '{' && IsUtf16()) {
181 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
182 Advance();
183 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { // NOLINTNEXTLINE(readability-magic-numbers)
184 if (c0_ == '}') {
185 Advance();
186 return true;
187 }
188 }
189 pc_ = start;
190 Advance();
191 return false;
192 }
193 // \u but no {, or \u{...} escapes not allowed.
194 bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
195 if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
196 // Attempt to read trail surrogate.
197 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
198 if (*pc_ == 'u') {
199 Advance(UNICODE_HEX_ADVANCE);
200 uint32_t trail = 0;
201 if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
202 *value = U16_GET_SUPPLEMENTARY((*value), (trail)); // NOLINTNEXTLINE(hicpp-signed-bitwise)
203 return true;
204 }
205 }
206 pc_ = start;
207 Advance();
208 }
209 return result;
210 }
211
ParseHexEscape(int length,uint32_t * value)212 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
213 {
214 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
215 uint32_t val = 0;
216 for (int i = 0; i < length; ++i) {
217 uint32_t c = c0_;
218 int d = static_cast<int>(HexValue(c));
219 if (d < 0) {
220 pc_ = start;
221 Advance();
222 return false;
223 }
224 val = val * HEX_VALUE + static_cast<uint32_t>(d);
225 Advance();
226 }
227 *value = val;
228 return true;
229 }
230
231 // NOLINTNEXTLINE(readability-function-size)
ParseAlternative(bool isBackward)232 void RegExpParser::ParseAlternative(bool isBackward)
233 {
234 size_t start = buffer_.size_;
235 while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
236 if (isError_) {
237 return;
238 }
239 size_t atomBcStart = buffer_.GetSize();
240 int captureIndex = 0;
241 bool isAtom = false;
242 switch (c0_) {
243 case '^': {
244 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
245 PrintF("Assertion %c line start \n", c0_);
246 LineStartOpCode lineStartOp;
247 lineStartOp.EmitOpCode(&buffer_, 0);
248 Advance();
249 break;
250 }
251 case '$': {
252 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
253 PrintF("Assertion %c line end \n", c0_);
254 LineEndOpCode lineEndOp;
255 lineEndOp.EmitOpCode(&buffer_, 0);
256 Advance();
257 break;
258 }
259 case '\\': {
260 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
261 PrintF("Escape %c \n", c0_);
262 Advance();
263 switch (c0_) {
264 case 'b': {
265 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
266 PrintF("Assertion %c \n", c0_);
267 WordBoundaryOpCode wordBoundaryOp;
268 wordBoundaryOp.EmitOpCode(&buffer_, 0);
269 Advance();
270 break;
271 }
272 case 'B': {
273 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
274 PrintF("Assertion %c \n", c0_);
275 NotWordBoundaryOpCode notWordBoundaryOp;
276 notWordBoundaryOp.EmitOpCode(&buffer_, 0);
277 Advance();
278 break;
279 }
280 default: {
281 isAtom = true;
282 int atomValue = ParseAtomEscape(isBackward);
283 if (atomValue != -1) {
284 PrevOpCode prevOp;
285 if (isBackward) {
286 prevOp.EmitOpCode(&buffer_, 0);
287 }
288 if (IsIgnoreCase()) {
289 if (!IsUtf16()) {
290 atomValue = Canonicalize(atomValue, false);
291 } else {
292 icu::UnicodeSet set(atomValue, atomValue);
293 set.closeOver(USET_CASE_INSENSITIVE);
294 set.removeAllStrings();
295 int32_t size = set.size();
296 RangeOpCode rangeOp;
297 RangeSet rangeResult;
298 for (int32_t idx = 0; idx < size; idx++) {
299 int32_t uc = set.charAt(idx);
300 RangeSet curRange(uc);
301 rangeResult.Insert(curRange);
302 }
303 rangeOp.InsertOpCode(&buffer_, rangeResult);
304 break;
305 }
306 }
307 if (atomValue <= UINT16_MAX) {
308 CharOpCode charOp;
309 charOp.EmitOpCode(&buffer_, atomValue);
310 } else {
311 Char32OpCode charOp;
312 charOp.EmitOpCode(&buffer_, atomValue);
313 }
314 if (isBackward) {
315 prevOp.EmitOpCode(&buffer_, 0);
316 }
317 }
318 break;
319 }
320 }
321 break;
322 }
323 case '(': {
324 Advance();
325 isAtom = ParseAssertionCapture(&captureIndex, isBackward);
326 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
327 Advance();
328 break;
329 }
330 case '.': {
331 PrevOpCode prevOp;
332 if (isBackward) {
333 prevOp.EmitOpCode(&buffer_, 0);
334 }
335 if (IsDotAll()) {
336 AllOpCode allOp;
337 allOp.EmitOpCode(&buffer_, 0);
338 } else {
339 DotsOpCode dotsOp;
340 dotsOp.EmitOpCode(&buffer_, 0);
341 }
342 if (isBackward) {
343 prevOp.EmitOpCode(&buffer_, 0);
344 }
345 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
346 PrintF("Atom %c match any \n", c0_);
347 isAtom = true;
348 Advance();
349 break;
350 }
351 case '[': {
352 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
353 PrintF("Atom %c match range \n", c0_);
354 isAtom = true;
355 PrevOpCode prevOp;
356 Advance();
357 if (isBackward) {
358 prevOp.EmitOpCode(&buffer_, 0);
359 }
360 bool isInvert = false;
361 if (c0_ == '^') {
362 isInvert = true;
363 Advance();
364 }
365 RangeSet rangeResult;
366 if (!ParseClassRanges(&rangeResult)) {
367 break;
368 }
369 if (isInvert) {
370 rangeResult.Invert(IsUtf16());
371 }
372 uint32_t highValue = rangeResult.HighestValue();
373 if (highValue <= UINT16_MAX) {
374 RangeOpCode rangeOp;
375 rangeOp.InsertOpCode(&buffer_, rangeResult);
376 } else {
377 Range32OpCode rangeOp;
378 rangeOp.InsertOpCode(&buffer_, rangeResult);
379 }
380
381 if (isBackward) {
382 prevOp.EmitOpCode(&buffer_, 0);
383 }
384 break;
385 }
386 case '*':
387 case '+':
388 case '?':
389 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
390 ParseError("nothing to repeat");
391 return;
392 case '{': {
393 uint8_t *begin = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
394 int dummy;
395 if (ParserIntervalQuantifier(&dummy, &dummy)) {
396 ParseError("nothing to repeat");
397 return;
398 }
399 pc_ = begin;
400 Advance();
401 }
402 [[fallthrough]];
403 case '}':
404 case ']':
405 if (IsUtf16()) {
406 ParseError("syntax error");
407 return;
408 }
409 [[fallthrough]];
410 default: {
411 // PatternCharacter
412 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
413 PrintF("PatternCharacter %c\n", c0_);
414 isAtom = true;
415 {
416 PrevOpCode prevOp;
417 if (isBackward) {
418 prevOp.EmitOpCode(&buffer_, 0);
419 }
420 uint32_t matchedChar = c0_;
421 if (c0_ > (INT8_MAX + 1)) {
422 Prev();
423 int i = 0;
424 UChar32 c;
425 int32_t length = end_ - pc_ + 1;
426 // NOLINTNEXTLINE(hicpp-signed-bitwise)
427 U8_NEXT(pc_, i, length, c); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
428 matchedChar = static_cast<uint32_t>(c);
429 pc_ += i; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
430 }
431 if (IsIgnoreCase()) {
432 matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
433 }
434 if (matchedChar > UINT16_MAX) {
435 Char32OpCode charOp;
436 charOp.EmitOpCode(&buffer_, matchedChar);
437 } else {
438 CharOpCode charOp;
439 charOp.EmitOpCode(&buffer_, matchedChar);
440 }
441 if (isBackward) {
442 prevOp.EmitOpCode(&buffer_, 0);
443 }
444 }
445 Advance();
446 break;
447 }
448 }
449 if (isAtom && !isError_) {
450 ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
451 }
452 if (isBackward) {
453 size_t end = buffer_.GetSize();
454 size_t termSize = end - atomBcStart;
455 size_t moveSize = end - start;
456 buffer_.Expand(end + termSize);
457 if (memmove_s(buffer_.buf_ + start + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
458 termSize, // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
459 moveSize,
460 buffer_.buf_ + start, // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
461 moveSize) != EOK) {
462 LOG_FULL(FATAL) << "memmove_s failed";
463 UNREACHABLE();
464 }
465 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
466 if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
467 LOG_FULL(FATAL) << "memcpy_s failed";
468 UNREACHABLE();
469 }
470 }
471 }
472 }
473
FindGroupName(const CString & name)474 int RegExpParser::FindGroupName(const CString &name)
475 {
476 size_t len = 0;
477 size_t nameLen = name.size();
478 const char *p = reinterpret_cast<char *>(groupNames_.buf_);
479 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
480 const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
481 int captureIndex = 1;
482 while (p < bufEnd) {
483 len = strlen(p);
484 if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
485 return captureIndex;
486 }
487 p += len + 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
488 captureIndex++;
489 }
490 return -1;
491 }
492
ParseAssertionCapture(int * captureIndex,bool isBackward)493 bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
494 {
495 bool isAtom = false;
496 do {
497 if (c0_ == '?') {
498 Advance();
499 switch (c0_) {
500 // (?=Disjunction[?U, ?N])
501 case '=': {
502 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
503 PrintF("Assertion(?= Disjunction)\n");
504 Advance();
505 uint32_t start = buffer_.size_;
506 ParseDisjunction(isBackward);
507 MatchOpCode matchOp;
508 matchOp.EmitOpCode(&buffer_, 0);
509 MatchAheadOpCode matchAheadOp;
510 uint32_t len = buffer_.size_ - start;
511 matchAheadOp.InsertOpCode(&buffer_, start, len);
512 break;
513 }
514 // (?!Disjunction[?U, ?N])
515 case '!': {
516 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
517 PrintF("Assertion(?! Disjunction)\n");
518 uint32_t start = buffer_.size_;
519 Advance();
520 ParseDisjunction(isBackward);
521 MatchOpCode matchOp;
522 matchOp.EmitOpCode(&buffer_, 0);
523 NegativeMatchAheadOpCode matchAheadOp;
524 uint32_t len = buffer_.size_ - start;
525 matchAheadOp.InsertOpCode(&buffer_, start, len);
526 break;
527 }
528 case '<': {
529 Advance();
530 // (?<=Disjunction[?U, ?N])
531 if (c0_ == '=') {
532 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
533 PrintF("Assertion(?<= Disjunction)\n");
534 Advance();
535 uint32_t start = buffer_.size_;
536 ParseDisjunction(true);
537 MatchOpCode matchOp;
538 matchOp.EmitOpCode(&buffer_, 0);
539 MatchAheadOpCode matchAheadOp;
540 uint32_t len = buffer_.size_ - start;
541 matchAheadOp.InsertOpCode(&buffer_, start, len);
542 // (?<!Disjunction[?U, ?N])
543 } else if (c0_ == '!') {
544 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
545 PrintF("Assertion(?<! Disjunction)\n");
546 Advance();
547 uint32_t start = buffer_.size_;
548 ParseDisjunction(true);
549 MatchOpCode matchOp;
550 matchOp.EmitOpCode(&buffer_, 0);
551 NegativeMatchAheadOpCode matchAheadOp;
552 uint32_t len = buffer_.size_ - start;
553 matchAheadOp.InsertOpCode(&buffer_, start, len);
554 } else {
555 Prev();
556 CString name;
557 auto **pp = const_cast<const uint8_t **>(&pc_);
558 if (!ParseGroupSpecifier(pp, name)) {
559 ParseError("GroupName Syntax error.");
560 return false;
561 }
562 if (FindGroupName(name) > 0) {
563 ParseError("Duplicate GroupName error.");
564 return false;
565 }
566 groupNames_.EmitStr(name.c_str());
567 newGroupNames_.push_back(name);
568 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
569 PrintF("group name %s", name.c_str());
570 Advance();
571 goto parseCapture; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto)
572 }
573 break;
574 }
575 // (?:Disjunction[?U, ?N])
576 case ':':
577 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
578 PrintF("Atom(?<: Disjunction)\n");
579 isAtom = true;
580 Advance();
581 ParseDisjunction(isBackward);
582 break;
583 default:
584 Advance();
585 ParseError("? Syntax error.");
586 return false;
587 }
588 if (isError_) {
589 return false;
590 }
591 } else {
592 groupNames_.EmitChar(0);
593 parseCapture:
594 isAtom = true;
595 *captureIndex = captureCount_++;
596 SaveEndOpCode saveEndOp;
597 SaveStartOpCode saveStartOp;
598 if (isBackward) {
599 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
600 } else {
601 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
602 }
603 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
604 PrintF("capture start %d \n", *captureIndex);
605 ParseDisjunction(isBackward);
606 if (isError_) {
607 return false;
608 }
609 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
610 PrintF("capture end %d \n", *captureIndex);
611 if (isBackward) {
612 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
613 } else {
614 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
615 }
616 }
617 } while (c0_ != ')' && c0_ != KEY_EOF);
618 if (c0_ != ')') {
619 ParseError("capture syntax error");
620 return false;
621 }
622 return isAtom;
623 }
624
ParseDecimalDigits()625 int RegExpParser::ParseDecimalDigits()
626 {
627 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
628 PrintF("Parse DecimalDigits------\n");
629 uint32_t result = 0;
630 bool overflow = false;
631 while (true) {
632 if (c0_ < '0' || c0_ > '9') {
633 break;
634 }
635 if (!overflow) {
636 if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
637 overflow = true;
638 } else {
639 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
640 }
641 }
642 Advance();
643 }
644 if (overflow) {
645 return INT32_MAX;
646 }
647 return result;
648 }
649
ParserIntervalQuantifier(int * pmin,int * pmax)650 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
651 {
652 // Quantifier::
653 // QuantifierPrefix
654 // QuantifierPrefix?
655 // QuantifierPrefix::
656 // *
657 // +
658 // ?
659 // {DecimalDigits}
660 // {DecimalDigits,}
661 // {DecimalDigits,DecimalDigits}
662 Advance();
663 *pmin = ParseDecimalDigits();
664 *pmax = *pmin;
665 switch (c0_) {
666 case ',': {
667 Advance();
668 if (c0_ == '}') {
669 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
670 PrintF("QuantifierPrefix{DecimalDigits,}\n");
671 *pmax = INT32_MAX;
672 Advance();
673 } else {
674 *pmax = ParseDecimalDigits();
675 if (c0_ == '}') {
676 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
677 PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
678 Advance();
679 } else {
680 return false;
681 }
682 }
683 break;
684 }
685 case '}':
686 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
687 PrintF("QuantifierPrefix{DecimalDigits}\n");
688 Advance();
689 break;
690 default:
691 Advance();
692 return false;
693 }
694 return true;
695 }
696
ParseQuantifier(size_t atomBcStart,int captureStart,int captureEnd)697 void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
698 {
699 int min = -1;
700 int max = -1;
701 bool isGreedy = true;
702 switch (c0_) {
703 case '*':
704 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
705 PrintF("QuantifierPrefix %c\n", c0_);
706 min = 0;
707 max = INT32_MAX;
708 Advance();
709 break;
710 case '+':
711 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
712 PrintF("QuantifierPrefix %c\n", c0_);
713 min = 1;
714 max = INT32_MAX;
715 Advance();
716 break;
717 case '?':
718 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
719 PrintF("QuantifierPrefix %c\n", c0_);
720 Advance();
721 min = 0;
722 max = 1;
723 break;
724 case '{': {
725 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
726 if (!ParserIntervalQuantifier(&min, &max)) {
727 pc_ = start;
728 Advance(); // back to '{'
729 return;
730 }
731 if (min > max) {
732 ParseError("Invalid repetition count");
733 return;
734 }
735 break;
736 }
737 default:
738 break;
739 }
740 if (c0_ == '?') {
741 isGreedy = false;
742 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
743 PrintF("Quantifier::QuantifierPrefix?\n");
744 Advance();
745 } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
746 ParseError("nothing to repeat");
747 return;
748 }
749 if (min != -1 && max != -1) {
750 stackCount_++;
751 PushOpCode pushOp;
752 pushOp.InsertOpCode(&buffer_, atomBcStart);
753 atomBcStart += pushOp.GetSize();
754
755 if (captureStart != 0) {
756 SaveResetOpCode saveResetOp;
757 saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
758 }
759
760 // zero advance check
761 if (max == INT32_MAX) {
762 stackCount_++;
763 PushCharOpCode pushCharOp;
764 pushCharOp.InsertOpCode(&buffer_, atomBcStart);
765 CheckCharOpCode checkCharOp;
766 // NOLINTNEXTLINE(readability-magic-numbers)
767 checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
768 }
769
770 if (isGreedy) {
771 LoopGreedyOpCode loopOp;
772 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
773 } else {
774 LoopOpCode loopOp;
775 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
776 }
777
778 if (min == 0) {
779 if (isGreedy) {
780 SplitNextOpCode splitNextOp;
781 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
782 } else {
783 SplitFirstOpCode splitFirstOp;
784 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
785 }
786 }
787
788 PopOpCode popOp;
789 popOp.EmitOpCode(&buffer_);
790 }
791 }
792
ParseGroupSpecifier(const uint8_t ** pp,CString & name)793 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, CString &name)
794 {
795 const uint8_t *p = *pp;
796 uint32_t c = 0;
797 char buffer[CACHE_SIZE] = {0};
798 char *q = buffer;
799 while (true) {
800 if (p <= end_) {
801 c = *p;
802 } else {
803 c = KEY_EOF;
804 }
805 if (c == '\\') {
806 p++;
807 if (*p != 'u') {
808 return false;
809 }
810 if (!ParseUnicodeEscape(&c)) {
811 return false;
812 }
813 } else if (c == '>') {
814 break;
815 } else if (c > CACHE_SIZE && c != KEY_EOF) {
816 c = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
817 } else if (c != KEY_EOF) {
818 p++;
819 } else {
820 return false;
821 }
822 if (q == buffer) {
823 if (!IsIdentFirst(c)) {
824 return false;
825 }
826 } else {
827 if (!u_isIDPart(c)) {
828 return false;
829 }
830 }
831 if (q != nullptr) {
832 *q++ = c;
833 }
834 } // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
835 p++;
836 *pp = p;
837 name = buffer;
838 return true;
839 }
840
ParseCaptureCount(const char * groupName)841 int RegExpParser::ParseCaptureCount(const char *groupName)
842 {
843 const uint8_t *p = nullptr;
844 int captureIndex = 1;
845 CString name;
846 hasNamedCaptures_ = 0;
847 for (p = base_; p < end_; p++) { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
848 switch (*p) {
849 case '(': {
850 if (p[1] == '?') { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
851 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
852 if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
853 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
854 p[CAPTURE_CONUT_ADVANCE] != '=') {
855 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
856 hasNamedCaptures_ = 1;
857 p += CAPTURE_CONUT_ADVANCE;
858 if (groupName != nullptr) {
859 if (ParseGroupSpecifier(&p, name)) {
860 if (strcmp(name.c_str(), groupName) == 0) {
861 return captureIndex;
862 }
863 }
864 }
865 captureIndex++;
866 }
867 } else {
868 captureIndex++;
869 }
870 break;
871 }
872 case '\\':
873 p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
874 break;
875 case '[': {
876 while (p < end_ && *p != ']') {
877 if (*p == '\\') {
878 p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
879 }
880 p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
881 }
882 break;
883 }
884 default:
885 break;
886 }
887 }
888 return captureIndex;
889 }
890
/*
 * Parses the part of an escape that follows the backslash; on entry c0_ is the character after '\'.
 *
 * Back references (\1..\9, \k<name>) and character class escapes (\d \D \s \S \w \W) emit their
 * bytecode here and leave the result at -1. Everything else is delegated to ParseCharacterEscape,
 * whose value is returned for the caller to emit.
 *
 * @param isBackward  true while compiling the body of a lookbehind; selects the backward back
 *                    reference opcode and brackets class escapes as Prev, <range>, Prev.
 * @return -1 when the escape was fully handled here (or an error was reported through
 *         ParseError), otherwise the value returned by ParseCharacterEscape.
 */
// NOLINTNEXTLINE(readability-function-size)
int RegExpParser::ParseAtomEscape(bool isBackward)
{
    // AtomEscape[U, N]::
    //     DecimalEscape
    //     CharacterClassEscape[?U]
    //     CharacterEscape[?U]
    //     [+N]kGroupName[?U]
    int result = -1;
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
    PrintF("Parse AtomEscape------\n");
    PrevOpCode prevOp;
    switch (c0_) {
        case KEY_EOF:
            // A pattern must not end right after a backslash.
            ParseError("unexpected end");
            break;
        // DecimalEscape
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9': {
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("NonZeroDigit %c\n", c0_);
            int capture = ParseDecimalDigits();
            // Accept the reference if the group was already opened, or if a scan of the whole
            // pattern shows that such a group exists further on.
            if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
                ParseError("invalid backreference count");
                break;
            }
            if (isBackward) {
                BackwardBackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, capture);
            } else {
                BackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, capture);
            }
            break;
        }
        // CharacterClassEscape
        case 'd': {
            // [0-9]
            RangeOpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, g_rangeD);
            goto parseLookBehind;
            break;
        }
        case 'D': {
            // [^0-9]
            RangeSet atomRange(g_rangeD);
            atomRange.Invert(IsUtf16());
            Range32OpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, atomRange);
            goto parseLookBehind;
            break;
        }
        case 's': {
            // White space and line terminators, see g_rangeS.
            RangeOpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, g_rangeS);
            goto parseLookBehind;
            break;
        }
        case 'S': {
            // Complement of g_rangeS.
            RangeSet atomRange(g_rangeS);
            Range32OpCode rangeOp;
            atomRange.Invert(IsUtf16());
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, atomRange);
            goto parseLookBehind;
            break;
        }
        case 'w': {
            // [A-Za-z0-9_]
            RangeOpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, g_rangeW);
            goto parseLookBehind;
            break;
        }
        case 'W': {
            // [^A-Za-z0-9_]
            RangeSet atomRange(g_rangeW);
            atomRange.Invert(IsUtf16());
            Range32OpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, atomRange);
            goto parseLookBehind;
            break;
        }
        // P{UnicodePropertyValueExpression}
        // p{UnicodePropertyValueExpression}
        // NOTE(review): 'P' and 'p' have no handling of their own and fall into the named back
        // reference code below, so \p{...} is parsed as if it were \k — confirm this is intended.
        case 'P':
        case 'p':
        // [+N]kGroupName[?U]
        case 'k': {
            Advance();
            if (c0_ != '<') {
                // NOTE(review): this rejects a missing group name when NOT in unicode mode (or when
                // named captures exist). ECMA-262 Annex B tolerates it only in that non-unicode
                // case — confirm whether IsUtf16() without the negation was intended.
                if (!IsUtf16() || HasNamedCaptures()) {
                    ParseError("expecting group name.");
                    break;
                }
            }
            // Advance followed by Prev; ParseGroupSpecifier then reads the name starting at pc_.
            Advance();
            Prev();
            CString name;
            auto **pp = const_cast<const uint8_t **>(&pc_);
            if (!ParseGroupSpecifier(pp, name)) {
                ParseError("GroupName Syntax error.");
                break;
            }
            // First look among the groups already parsed, then scan ahead in the pattern text.
            int postion = FindGroupName(name);
            if (postion < 0) {
                postion = ParseCaptureCount(name.c_str());
                // NOTE(review): the ParseCaptureCount shown in this file never returns a negative
                // value, so this error looks unreachable — confirm.
                if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
                    ParseError("group name not defined");
                    break;
                }
            }
            if (isBackward) {
                BackwardBackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, postion);
            } else {
                BackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, postion);
            }
            // Load the character that follows the group specifier.
            Advance();
            break;
        }
        // Shared tail of the class escapes above (reached only through goto): emit the second
        // Prev in backward mode, then step past the class letter.
        parseLookBehind: {
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            Advance();
            break;
        }
        default:
            result = ParseCharacterEscape();
            break;
    }
    return result;
}
1053
RecountCaptures()1054 int RegExpParser::RecountCaptures()
1055 {
1056 if (totalCaptureCount_ < 0) {
1057 const char *name = reinterpret_cast<const char*>(groupNames_.buf_);
1058 totalCaptureCount_ = ParseCaptureCount(name);
1059 }
1060 return totalCaptureCount_;
1061 }
HasNamedCaptures()1062 bool RegExpParser::HasNamedCaptures()
1063 {
1064 if (hasNamedCaptures_ < 0) {
1065 RecountCaptures();
1066 }
1067 return false;
1068 }
1069
ParseCharacterEscape()1070 int RegExpParser::ParseCharacterEscape()
1071 {
1072 // CharacterEscape[U]::
1073 // ControlEscape
1074 // c ControlLetter
1075 // 0 [lookahead ∉ DecimalDigit]
1076 // HexEscapeSequence
1077 // RegExpUnicodeEscapeSequence[?U]
1078 // IdentityEscape[?U]
1079 uint32_t result = 0;
1080 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1081 switch (c0_) {
1082 // ControlEscape
1083 case 'f':
1084 result = '\f';
1085 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1086 PrintF("ControlEscape %c\n", c0_);
1087 Advance();
1088 break;
1089 case 'n':
1090 result = '\n';
1091 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1092 PrintF("ControlEscape %c\n", c0_);
1093 Advance();
1094 break;
1095 case 'r':
1096 result = '\r';
1097 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1098 PrintF("ControlEscape %c\n", c0_);
1099 Advance();
1100 break;
1101 case 't':
1102 result = '\t';
1103 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1104 PrintF("ControlEscape %c\n", c0_);
1105 Advance();
1106 break;
1107 case 'v':
1108 result = '\v';
1109 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1110 PrintF("ControlEscape %c\n", c0_);
1111 Advance();
1112 break;
1113 // c ControlLetter
1114 case 'c': {
1115 Advance();
1116 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1117 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1118 PrintF("ControlLetter %c\n", c0_);
1119 result = static_cast<uint32_t>(c0_) & 0x1f; // NOLINTNEXTLINE(readability-magic-numbers)
1120 Advance();
1121 } else {
1122 if (!IsUtf16()) {
1123 pc_--; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1124 result = '\\';
1125 } else {
1126 ParseError("Invalid control letter");
1127 return -1;
1128 }
1129 }
1130 break;
1131 }
1132 case '0': {
1133 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1134 PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
1135 if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) { // NOLINTNEXTLINE(readability-magic-numbers)
1136 Advance();
1137 result = 0;
1138 break;
1139 }
1140 [[fallthrough]];
1141 }
1142 case '1':
1143 case '2':
1144 case '3':
1145 case '4':
1146 case '5':
1147 case '6':
1148 case '7': {
1149 if (IsUtf16()) {
1150 // With /u, decimal escape is not interpreted as octal character code.
1151 ParseError("Invalid class escape");
1152 return 0;
1153 }
1154 result = ParseOctalLiteral();
1155 break;
1156 }
1157 // ParseHexEscapeSequence
1158 // ParseRegExpUnicodeEscapeSequence
1159 case 'x': {
1160 Advance();
1161 if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
1162 return result;
1163 }
1164 if (IsUtf16()) {
1165 ParseError("Invalid class escape");
1166 return -1;
1167 }
1168 result = 'x';
1169 break;
1170 }
1171 case 'u': {
1172 Advance();
1173 if (ParseUnicodeEscape(&result)) {
1174 return result;
1175 }
1176 if (IsUtf16()) {
1177 // With /u, invalid escapes are not treated as identity escapes.
1178 ParseError("Invalid unicode escape");
1179 return 0;
1180 }
1181 // If \u is not followed by a two-digit hexadecimal, treat it
1182 // as an identity escape.
1183 result = 'u';
1184 break;
1185 }
1186 // IdentityEscape[?U]
1187 case '$':
1188 case '(':
1189 case ')':
1190 case '*':
1191 case '+':
1192 case '.':
1193 case '/':
1194 case '?':
1195 case '[':
1196 case '\\':
1197 case ']':
1198 case '^':
1199 case '{':
1200 case '|':
1201 case '}':
1202 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1203 PrintF("IdentityEscape %c\n", c0_);
1204 result = c0_;
1205 Advance();
1206 break;
1207 default: {
1208 if (IsUtf16()) {
1209 ParseError("Invalid unicode escape");
1210 return 0;
1211 }
1212 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1213 PrintF("SourceCharacter %c\n", c0_);
1214 result = c0_;
1215 if (result < CHAR_MAXS) {
1216 Advance();
1217 }
1218 break;
1219 }
1220 }
1221 return result;
1222 }
1223
ParseClassRanges(RangeSet * result)1224 bool RegExpParser::ParseClassRanges(RangeSet *result)
1225 {
1226 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1227 PrintF("Parse ClassRanges------\n");
1228 while (c0_ != ']') {
1229 RangeSet s1;
1230 uint32_t c1 = ParseClassAtom(&s1);
1231 if (c1 == UINT32_MAX) {
1232 ParseError("invalid class range");
1233 return false;
1234 }
1235
1236 int next_c0 = *pc_;
1237 if (c0_ == '-' && next_c0 != ']') {
1238 if (c1 == CLASS_RANGE_BASE) {
1239 if (IsUtf16()) {
1240 ParseError("invalid class range");
1241 return false;
1242 }
1243 result->Insert(s1);
1244 continue;
1245 }
1246 Advance();
1247 RangeSet s2;
1248 uint32_t c2 = ParseClassAtom(&s2);
1249 if (c2 == UINT32_MAX) {
1250 ParseError("invalid class range");
1251 return false;
1252 }
1253 if (c2 == CLASS_RANGE_BASE) {
1254 if (IsUtf16()) {
1255 ParseError("invalid class range");
1256 return false;
1257 }
1258 result->Insert(s2);
1259 continue;
1260 }
1261 if (c1 < INT8_MAX) {
1262 if (c1 > c2) {
1263 ParseError("invalid class range");
1264 return false;
1265 }
1266 }
1267 if (IsIgnoreCase()) {
1268 c1 = static_cast<uint32_t>(Canonicalize(c1, IsUtf16()));
1269 c2 = static_cast<uint32_t>(Canonicalize(c2, IsUtf16()));
1270 }
1271
1272 result->Insert(c1, c2);
1273 } else {
1274 result->Insert(s1);
1275 }
1276 }
1277 Advance();
1278 return true;
1279 }
1280
ParseClassAtom(RangeSet * atom)1281 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1282 {
1283 uint32_t ret = UINT32_MAX;
1284 switch (c0_) {
1285 case '\\': {
1286 Advance();
1287 ret = static_cast<uint32_t>(ParseClassEscape(atom));
1288 break;
1289 }
1290 case KEY_EOF:
1291 break;
1292 case 0: {
1293 if (pc_ >= end_) {
1294 return UINT32_MAX;
1295 }
1296 [[fallthrough]];
1297 }
1298 default: {
1299 uint32_t value = c0_;
1300 size_t u16_size = 0;
1301 if (c0_ > INT8_MAX) { // NOLINTNEXTLINE(readability-magic-numbers)
1302 pc_ -= 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1303 auto u16_result = base::utf_helper::ConvertUtf8ToUtf16Pair(pc_, true);
1304 value = u16_result.first;
1305 u16_size = u16_result.second;
1306 Advance(u16_size + 1);
1307 } else {
1308 Advance();
1309 }
1310 if (IsIgnoreCase()) {
1311 value = static_cast<uint32_t>(Canonicalize(value, IsUtf16()));
1312 }
1313 atom->Insert(RangeSet(value));
1314 ret = value;
1315 break;
1316 }
1317 }
1318 return ret;
1319 }
1320
ParseClassEscape(RangeSet * atom)1321 int RegExpParser::ParseClassEscape(RangeSet *atom)
1322 {
1323 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1324 PrintF("Parse ClassEscape------\n");
1325 int result = -1;
1326 switch (c0_) {
1327 case 'b':
1328 Advance();
1329 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1330 PrintF("ClassEscape %c", 'b');
1331 result = '\b';
1332 atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1333 break;
1334 case '-':
1335 Advance();
1336 result = '-';
1337 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1338 PrintF("ClassEscape %c", '-');
1339 atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1340 break;
1341 // CharacterClassEscape
1342 case 'd':
1343 case 'D':
1344 result = CLASS_RANGE_BASE;
1345 atom->Insert(g_rangeD);
1346 if (c0_ == 'D') {
1347 atom->Invert(IsUtf16());
1348 }
1349 Advance();
1350 break;
1351 case 's':
1352 case 'S':
1353 result = CLASS_RANGE_BASE;
1354 atom->Insert(g_rangeS);
1355 if (c0_ == 'S') {
1356 atom->Invert(IsUtf16());
1357 }
1358 Advance();
1359 break;
1360 case 'w':
1361 case 'W':
1362 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1363 PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1364 result = CLASS_RANGE_BASE;
1365 atom->Insert(g_rangeW);
1366 if (c0_ == 'W') {
1367 atom->Invert(IsUtf16());
1368 }
1369 Advance();
1370 break;
1371 // P{UnicodePropertyValueExpression}
1372 // p{UnicodePropertyValueExpression}
1373 case 'P':
1374 case 'p':
1375 PrintF("Warning: \\p is not supported in ECMA 2015!");
1376 Advance();
1377 if (c0_ == '{') {
1378 Advance();
1379 if (c0_ == '}') {
1380 break; // p{}, invalid
1381 }
1382 bool isValue = false;
1383 ParseUnicodePropertyValueCharacters(&isValue);
1384 if (!isValue && c0_ == '=') {
1385 // UnicodePropertyName = UnicodePropertyValue
1386 Advance();
1387 if (c0_ == '}') {
1388 break; // p{xxx=}, invalid
1389 }
1390 ParseUnicodePropertyValueCharacters(&isValue);
1391 }
1392 if (c0_ != '}') {
1393 break; // p{xxx, invalid
1394 }
1395 // should do atom->Invert() here after ECMA 9.0
1396 Advance();
1397 result = CLASS_RANGE_BASE;
1398 }
1399 break;
1400 default:
1401 result = ParseCharacterEscape();
1402 int value = result;
1403 if (IsIgnoreCase()) {
1404 value = Canonicalize(value, IsUtf16());
1405 }
1406 atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1407 break;
1408 }
1409 return result;
1410 }
1411
ParseUnicodePropertyValueCharacters(bool * isValue)1412 void RegExpParser::ParseUnicodePropertyValueCharacters(bool *isValue)
1413 {
1414 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1415 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1416 PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1417 } else if (c0_ == '_') {
1418 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1419 PrintF("UnicodePropertyCharacter:: _ \n");
1420 } else if (c0_ >= '0' && c0_ <= '9') {
1421 *isValue = true;
1422 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1423 PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1424 } else {
1425 return;
1426 }
1427 Advance();
1428 ParseUnicodePropertyValueCharacters(isValue);
1429 }
1430
1431 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1432 void RegExpParser::PrintF(const char *fmt, ...)
1433 {
1434 #ifndef _NO_DEBUG_
1435 va_list args;
1436 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,)
1437 va_start(args, fmt);
1438 vprintf(fmt, args);
1439 va_end(args);
1440 #else
1441 (void)fmt;
1442 #endif
1443 }
1444
ParseError(const char * errorMessage)1445 void RegExpParser::ParseError(const char *errorMessage)
1446 {
1447 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1448 PrintF("error: ");
1449 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1450 PrintF(errorMessage);
1451 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1452 PrintF("\n");
1453 SetIsError();
1454 size_t length = strlen(errorMessage) + 1;
1455 if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1456 LOG_FULL(FATAL) << "memcpy_s failed";
1457 UNREACHABLE();
1458 }
1459 }
1460
IsIdentFirst(uint32_t c)1461 int RegExpParser::IsIdentFirst(uint32_t c)
1462 {
1463 if (c < CACHE_SIZE) {
1464 return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1; // 5: Shift five bits 31: and operation binary of 31
1465 } else {
1466 return static_cast<int>(u_isIDStart(c));
1467 }
1468 }
1469 } // namespace panda::ecmascript