1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "ecmascript/regexp/regexp_parser.h"
17
18 #include "ecmascript/base/string_helper.h"
19 #include "ecmascript/ecma_macros.h"
20 #include "ecmascript/regexp/regexp_opcode.h"
21 #include "libpandabase/utils/utils.h"
22 #include "securec.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uniset.h"
25 #define _NO_DEBUG_
26
27 namespace panda::ecmascript {
28 static constexpr uint32_t CACHE_SIZE = 128;
29 static constexpr uint32_t CHAR_MAXS = 128;
30 static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
31 /* $ A-Z _ a-z */
32 0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
33 };
34 static RangeSet g_rangeD(0x30, 0x39); // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
35 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
36 static RangeSet g_rangeS({
37 std::pair<uint32_t, uint32_t>(0x0009, 0x000D), // NOLINTNEXTLINE(readability-magic-numbers)
38 std::pair<uint32_t, uint32_t>(0x0020, 0x0020), // NOLINTNEXTLINE(readability-magic-numbers)
39 std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0), // NOLINTNEXTLINE(readability-magic-numbers)
40 std::pair<uint32_t, uint32_t>(0x1680, 0x1680), // NOLINTNEXTLINE(readability-magic-numbers)
41 std::pair<uint32_t, uint32_t>(0x2000, 0x200A), // NOLINTNEXTLINE(readability-magic-numbers)
42 /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
43 /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
44 std::pair<uint32_t, uint32_t>(0x2028, 0x2029), // NOLINTNEXTLINE(readability-magic-numbers)
45 std::pair<uint32_t, uint32_t>(0x202F, 0x202F), // NOLINTNEXTLINE(readability-magic-numbers)
46 std::pair<uint32_t, uint32_t>(0x205F, 0x205F), // NOLINTNEXTLINE(readability-magic-numbers)
47 std::pair<uint32_t, uint32_t>(0x3000, 0x3000), // NOLINTNEXTLINE(readability-magic-numbers)
48 /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
49 std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF), // NOLINTNEXTLINE(readability-magic-numbers)
50 });
51
52 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
53 static RangeSet g_rangeW({
54 std::pair<uint32_t, uint32_t>(0x0030, 0x0039), // NOLINTNEXTLINE(readability-magic-numbers)
55 std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINTNEXTLINE(readability-magic-numbers)
56 std::pair<uint32_t, uint32_t>(0x005F, 0x005F), // NOLINTNEXTLINE(readability-magic-numbers)
57 std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINTNEXTLINE(readability-magic-numbers)
58 });
59
60 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
61 static RangeSet g_regexpIdentifyStart({
62 std::pair<uint32_t, uint32_t>(0x0024, 0x0024), // NOLINTNEXTLINE(readability-magic-numbers)
63 std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINTNEXTLINE(readability-magic-numbers)
64 std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINTNEXTLINE(readability-magic-numbers)
65 });
66
67 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
68 static RangeSet g_regexpIdentifyContinue({
69 std::pair<uint32_t, uint32_t>(0x0024, 0x0024), // NOLINTNEXTLINE(readability-magic-numbers)
70 std::pair<uint32_t, uint32_t>(0x0030, 0x0039), // NOLINTNEXTLINE(readability-magic-numbers)
71 std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINTNEXTLINE(readability-magic-numbers)
72 std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINTNEXTLINE(readability-magic-numbers)
73 });
74
Parse()75 void RegExpParser::Parse()
76 {
77 // dynbuffer head init [size,capture_count,statck_count,flags]
78 buffer_.EmitU32(0);
79 buffer_.EmitU32(0);
80 buffer_.EmitU32(0);
81 buffer_.EmitU32(0);
82 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
83 PrintF("Parse Pattern------\n");
84 // Pattern[U, N]::
85 // Disjunction[?U, ?N]
86 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
87 Advance();
88 SaveStartOpCode saveStartOp;
89 int captureIndex = captureCount_++;
90 saveStartOp.EmitOpCode(&buffer_, captureIndex);
91 ParseDisjunction(false);
92 if (c0_ != KEY_EOF) {
93 ParseError("extraneous characters at the end");
94 return;
95 }
96 SaveEndOpCode saveEndOp;
97 saveEndOp.EmitOpCode(&buffer_, captureIndex);
98 MatchEndOpCode matchEndOp;
99 matchEndOp.EmitOpCode(&buffer_, 0);
100 // dynbuffer head assignments
101 buffer_.PutU32(0, buffer_.size_);
102 buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
103 buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
104 buffer_.PutU32(FLAGS_OFFSET, flags_);
105 #ifndef _NO_DEBUG_
106 RegExpOpCode::DumpRegExpOpCode(std::cout, buffer_);
107 #endif
108 }
109
ParseDisjunction(bool isBackward)110 void RegExpParser::ParseDisjunction(bool isBackward)
111 {
112 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
113 PrintF("Parse Disjunction------\n");
114 size_t start = buffer_.size_;
115 ParseAlternative(isBackward);
116 if (isError_) {
117 return;
118 }
119 do {
120 if (c0_ == '|') {
121 SplitNextOpCode splitOp;
122 uint32_t len = buffer_.size_ - start;
123 GotoOpCode gotoOp;
124 splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
125 uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
126 Advance();
127 ParseAlternative(isBackward);
128 gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
129 }
130 } while (c0_ != KEY_EOF && c0_ != ')');
131 }
132
ParseOctalLiteral()133 uint32_t RegExpParser::ParseOctalLiteral()
134 {
135 // For compatibility with some other browsers (not all), we parse
136 // up to three octal digits with a value below 256.
137 // ES#prod-annexB-LegacyOctalEscapeSequence
138 uint32_t value = c0_ - '0';
139 Advance();
140 if (c0_ >= '0' && c0_ <= '7') {
141 value = value * OCTAL_VALUE + c0_ - '0';
142 Advance();
143 if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
144 value = value * OCTAL_VALUE + c0_ - '0';
145 Advance();
146 }
147 }
148 return value;
149 }
150
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)151 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
152 {
153 uint32_t x = 0;
154 int d = static_cast<int>(HexValue(c0_));
155 if (d < 0) {
156 return false;
157 }
158 while (d >= 0) {
159 if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
160 LOG_FULL(FATAL) << "value overflow";
161 return false;
162 }
163 x = x * HEX_VALUE + static_cast<uint32_t>(d);
164 if (x > maxValue) {
165 return false;
166 }
167 Advance();
168 d = static_cast<int>(HexValue(c0_));
169 }
170 *value = x;
171 return true;
172 }
173
174 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uint32_t * value)175 bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
176 {
177 // Accept both \uxxxx and \u{xxxxxx} (if allowed).
178 // In the latter case, the number of hex digits between { } is arbitrary.
179 // \ and u have already been read.
180 if (c0_ == '{' && IsUtf16()) {
181 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
182 Advance();
183 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { // NOLINTNEXTLINE(readability-magic-numbers)
184 if (c0_ == '}') {
185 Advance();
186 return true;
187 }
188 }
189 pc_ = start;
190 Advance();
191 return false;
192 }
193 // \u but no {, or \u{...} escapes not allowed.
194 bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
195 if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
196 // Attempt to read trail surrogate.
197 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
198 if (*pc_ == 'u') {
199 Advance(UNICODE_HEX_ADVANCE);
200 uint32_t trail = 0;
201 if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
202 *value = U16_GET_SUPPLEMENTARY((*value), (trail)); // NOLINTNEXTLINE(hicpp-signed-bitwise)
203 return true;
204 }
205 }
206 pc_ = start;
207 Advance();
208 }
209 return result;
210 }
211
ParseHexEscape(int length,uint32_t * value)212 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
213 {
214 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
215 uint32_t val = 0;
216 for (int i = 0; i < length; ++i) {
217 uint32_t c = c0_;
218 int d = static_cast<int>(HexValue(c));
219 if (d < 0) {
220 pc_ = start;
221 Advance();
222 return false;
223 }
224 val = val * HEX_VALUE + static_cast<uint32_t>(d);
225 Advance();
226 }
227 *value = val;
228 return true;
229 }
230
231 // NOLINTNEXTLINE(readability-function-size)
ParseAlternative(bool isBackward)232 void RegExpParser::ParseAlternative(bool isBackward)
233 {
234 size_t start = buffer_.size_;
235 while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
236 if (isError_) {
237 return;
238 }
239 size_t atomBcStart = buffer_.GetSize();
240 int captureIndex = 0;
241 bool isAtom = false;
242 switch (c0_) {
243 case '^': {
244 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
245 PrintF("Assertion %c line start \n", c0_);
246 LineStartOpCode lineStartOp;
247 lineStartOp.EmitOpCode(&buffer_, 0);
248 Advance();
249 break;
250 }
251 case '$': {
252 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
253 PrintF("Assertion %c line end \n", c0_);
254 LineEndOpCode lineEndOp;
255 lineEndOp.EmitOpCode(&buffer_, 0);
256 Advance();
257 break;
258 }
259 case '\\': {
260 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
261 PrintF("Escape %c \n", c0_);
262 Advance();
263 switch (c0_) {
264 case 'b': {
265 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
266 PrintF("Assertion %c \n", c0_);
267 WordBoundaryOpCode wordBoundaryOp;
268 wordBoundaryOp.EmitOpCode(&buffer_, 0);
269 Advance();
270 break;
271 }
272 case 'B': {
273 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
274 PrintF("Assertion %c \n", c0_);
275 NotWordBoundaryOpCode notWordBoundaryOp;
276 notWordBoundaryOp.EmitOpCode(&buffer_, 0);
277 Advance();
278 break;
279 }
280 default: {
281 isAtom = true;
282 int atomValue = ParseAtomEscape(isBackward);
283 if (atomValue != -1) {
284 PrevOpCode prevOp;
285 if (isBackward) {
286 prevOp.EmitOpCode(&buffer_, 0);
287 }
288 if (IsIgnoreCase()) {
289 if (!IsUtf16()) {
290 atomValue = Canonicalize(atomValue, false);
291 } else {
292 icu::UnicodeSet set(atomValue, atomValue);
293 set.closeOver(USET_CASE_INSENSITIVE);
294 set.removeAllStrings();
295 uint32_t size = static_cast<uint32_t>(set.size());
296 RangeOpCode rangeOp;
297 RangeSet rangeResult;
298 for (uint32_t idx = 0; idx < size; idx++) {
299 int32_t uc = set.charAt(idx);
300 RangeSet curRange(uc);
301 rangeResult.Insert(curRange);
302 }
303 rangeOp.InsertOpCode(&buffer_, rangeResult);
304 break;
305 }
306 }
307 if (atomValue <= UINT16_MAX) {
308 CharOpCode charOp;
309 charOp.EmitOpCode(&buffer_, atomValue);
310 } else {
311 Char32OpCode charOp;
312 charOp.EmitOpCode(&buffer_, atomValue);
313 }
314 if (isBackward) {
315 prevOp.EmitOpCode(&buffer_, 0);
316 }
317 }
318 break;
319 }
320 }
321 break;
322 }
323 case '(': {
324 Advance();
325 isAtom = ParseAssertionCapture(&captureIndex, isBackward);
326 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
327 Advance();
328 break;
329 }
330 case '.': {
331 PrevOpCode prevOp;
332 if (isBackward) {
333 prevOp.EmitOpCode(&buffer_, 0);
334 }
335 if (IsDotAll()) {
336 AllOpCode allOp;
337 allOp.EmitOpCode(&buffer_, 0);
338 } else {
339 DotsOpCode dotsOp;
340 dotsOp.EmitOpCode(&buffer_, 0);
341 }
342 if (isBackward) {
343 prevOp.EmitOpCode(&buffer_, 0);
344 }
345 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
346 PrintF("Atom %c match any \n", c0_);
347 isAtom = true;
348 Advance();
349 break;
350 }
351 case '[': {
352 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
353 PrintF("Atom %c match range \n", c0_);
354 isAtom = true;
355 PrevOpCode prevOp;
356 Advance();
357 if (isBackward) {
358 prevOp.EmitOpCode(&buffer_, 0);
359 }
360 bool isInvert = false;
361 if (c0_ == '^') {
362 isInvert = true;
363 Advance();
364 }
365 RangeSet rangeResult;
366 if (!ParseClassRanges(&rangeResult)) {
367 break;
368 }
369 if (isInvert) {
370 rangeResult.Invert(IsUtf16());
371 }
372 uint32_t highValue = rangeResult.HighestValue();
373 if (highValue <= UINT16_MAX) {
374 RangeOpCode rangeOp;
375 rangeOp.InsertOpCode(&buffer_, rangeResult);
376 } else {
377 Range32OpCode rangeOp;
378 rangeOp.InsertOpCode(&buffer_, rangeResult);
379 }
380
381 if (isBackward) {
382 prevOp.EmitOpCode(&buffer_, 0);
383 }
384 break;
385 }
386 case '*':
387 case '+':
388 case '?':
389 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
390 ParseError("nothing to repeat");
391 return;
392 case '{': {
393 uint8_t *begin = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
394 int dummy;
395 if (ParserIntervalQuantifier(&dummy, &dummy)) {
396 ParseError("nothing to repeat");
397 return;
398 }
399 pc_ = begin;
400 Advance();
401 }
402 [[fallthrough]];
403 case '}':
404 case ']':
405 if (IsUtf16()) {
406 ParseError("syntax error");
407 return;
408 }
409 [[fallthrough]];
410 default: {
411 // PatternCharacter
412 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
413 PrintF("PatternCharacter %c\n", c0_);
414 isAtom = true;
415 {
416 PrevOpCode prevOp;
417 if (isBackward) {
418 prevOp.EmitOpCode(&buffer_, 0);
419 }
420 uint32_t matchedChar = c0_;
421 if (c0_ > (INT8_MAX + 1)) {
422 Prev();
423 int i = 0;
424 UChar32 c;
425 int32_t length = end_ - pc_ + 1;
426 // NOLINTNEXTLINE(hicpp-signed-bitwise)
427 U8_NEXT(pc_, i, length, c); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
428 matchedChar = static_cast<uint32_t>(c);
429 pc_ += i; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
430 }
431 if (IsIgnoreCase()) {
432 matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
433 }
434 if (matchedChar > UINT16_MAX) {
435 Char32OpCode charOp;
436 charOp.EmitOpCode(&buffer_, matchedChar);
437 } else {
438 CharOpCode charOp;
439 charOp.EmitOpCode(&buffer_, matchedChar);
440 }
441 if (isBackward) {
442 prevOp.EmitOpCode(&buffer_, 0);
443 }
444 }
445 Advance();
446 break;
447 }
448 }
449 if (isAtom && !isError_) {
450 ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
451 }
452 if (isBackward) {
453 size_t end = buffer_.GetSize();
454 size_t termSize = end - atomBcStart;
455 size_t moveSize = end - start;
456 buffer_.Expand(end + termSize);
457 if (memmove_s(buffer_.buf_ + start + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
458 termSize, // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
459 moveSize,
460 buffer_.buf_ + start, // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
461 moveSize) != EOK) {
462 LOG_FULL(FATAL) << "memmove_s failed";
463 UNREACHABLE();
464 }
465 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
466 if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
467 LOG_FULL(FATAL) << "memcpy_s failed";
468 UNREACHABLE();
469 }
470 }
471 }
472 }
473
FindGroupName(const CString & name)474 int RegExpParser::FindGroupName(const CString &name)
475 {
476 size_t len = 0;
477 size_t nameLen = name.size();
478 const char *p = reinterpret_cast<char *>(groupNames_.buf_);
479 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
480 const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
481 int captureIndex = 1;
482 while (p < bufEnd) {
483 len = strlen(p);
484 if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
485 return captureIndex;
486 }
487 p += len + 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
488 captureIndex++;
489 }
490 return -1;
491 }
492
ParseAssertionCapture(int * captureIndex,bool isBackward)493 bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
494 {
495 bool isAtom = false;
496 do {
497 if (c0_ == '?') {
498 Advance();
499 switch (c0_) {
500 // (?=Disjunction[?U, ?N])
501 case '=': {
502 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
503 PrintF("Assertion(?= Disjunction)\n");
504 Advance();
505 uint32_t start = buffer_.size_;
506 ParseDisjunction(isBackward);
507 MatchOpCode matchOp;
508 matchOp.EmitOpCode(&buffer_, 0);
509 MatchAheadOpCode matchAheadOp;
510 uint32_t len = buffer_.size_ - start;
511 matchAheadOp.InsertOpCode(&buffer_, start, len);
512 break;
513 }
514 // (?!Disjunction[?U, ?N])
515 case '!': {
516 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
517 PrintF("Assertion(?! Disjunction)\n");
518 uint32_t start = buffer_.size_;
519 Advance();
520 ParseDisjunction(isBackward);
521 MatchOpCode matchOp;
522 matchOp.EmitOpCode(&buffer_, 0);
523 NegativeMatchAheadOpCode matchAheadOp;
524 uint32_t len = buffer_.size_ - start;
525 matchAheadOp.InsertOpCode(&buffer_, start, len);
526 break;
527 }
528 case '<': {
529 Advance();
530 // (?<=Disjunction[?U, ?N])
531 if (c0_ == '=') {
532 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
533 PrintF("Assertion(?<= Disjunction)\n");
534 Advance();
535 uint32_t start = buffer_.size_;
536 ParseDisjunction(true);
537 MatchOpCode matchOp;
538 matchOp.EmitOpCode(&buffer_, 0);
539 MatchAheadOpCode matchAheadOp;
540 uint32_t len = buffer_.size_ - start;
541 matchAheadOp.InsertOpCode(&buffer_, start, len);
542 // (?<!Disjunction[?U, ?N])
543 } else if (c0_ == '!') {
544 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
545 PrintF("Assertion(?<! Disjunction)\n");
546 Advance();
547 uint32_t start = buffer_.size_;
548 ParseDisjunction(true);
549 MatchOpCode matchOp;
550 matchOp.EmitOpCode(&buffer_, 0);
551 NegativeMatchAheadOpCode matchAheadOp;
552 uint32_t len = buffer_.size_ - start;
553 matchAheadOp.InsertOpCode(&buffer_, start, len);
554 } else {
555 Prev();
556 CString name;
557 auto **pp = const_cast<const uint8_t **>(&pc_);
558 if (!ParseGroupSpecifier(pp, name)) {
559 ParseError("GroupName Syntax error.");
560 return false;
561 }
562 if (FindGroupName(name) > 0) {
563 ParseError("Duplicate GroupName error.");
564 return false;
565 }
566 groupNames_.EmitStr(name.c_str());
567 newGroupNames_.push_back(name);
568 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
569 PrintF("group name %s", name.c_str());
570 Advance();
571 goto parseCapture; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto)
572 }
573 break;
574 }
575 // (?:Disjunction[?U, ?N])
576 case ':':
577 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
578 PrintF("Atom(?<: Disjunction)\n");
579 isAtom = true;
580 Advance();
581 ParseDisjunction(isBackward);
582 break;
583 default:
584 Advance();
585 ParseError("? Syntax error.");
586 return false;
587 }
588 if (isError_) {
589 return false;
590 }
591 } else {
592 groupNames_.EmitChar(0);
593 parseCapture:
594 isAtom = true;
595 *captureIndex = captureCount_++;
596 SaveEndOpCode saveEndOp;
597 SaveStartOpCode saveStartOp;
598 if (isBackward) {
599 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
600 } else {
601 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
602 }
603 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
604 PrintF("capture start %d \n", *captureIndex);
605 ParseDisjunction(isBackward);
606 if (isError_) {
607 return false;
608 }
609 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
610 PrintF("capture end %d \n", *captureIndex);
611 if (isBackward) {
612 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
613 } else {
614 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
615 }
616 }
617 } while (c0_ != ')' && c0_ != KEY_EOF);
618 if (c0_ != ')') {
619 ParseError("capture syntax error");
620 return false;
621 }
622 return isAtom;
623 }
624
ParseDecimalDigits()625 int RegExpParser::ParseDecimalDigits()
626 {
627 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
628 PrintF("Parse DecimalDigits------\n");
629 uint32_t result = 0;
630 bool overflow = false;
631 while (true) {
632 if (c0_ < '0' || c0_ > '9') {
633 break;
634 }
635 if (!overflow) {
636 if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
637 overflow = true;
638 } else {
639 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
640 }
641 }
642 Advance();
643 }
644 if (overflow) {
645 return INT32_MAX;
646 }
647 return result;
648 }
649
ParserIntervalQuantifier(int * pmin,int * pmax)650 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
651 {
652 // Quantifier::
653 // QuantifierPrefix
654 // QuantifierPrefix?
655 // QuantifierPrefix::
656 // *
657 // +
658 // ?
659 // {DecimalDigits}
660 // {DecimalDigits,}
661 // {DecimalDigits,DecimalDigits}
662 Advance();
663 *pmin = ParseDecimalDigits();
664 *pmax = *pmin;
665 switch (c0_) {
666 case ',': {
667 Advance();
668 if (c0_ == '}') {
669 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
670 PrintF("QuantifierPrefix{DecimalDigits,}\n");
671 *pmax = INT32_MAX;
672 Advance();
673 } else {
674 *pmax = ParseDecimalDigits();
675 if (c0_ == '}') {
676 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
677 PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
678 Advance();
679 } else {
680 return false;
681 }
682 }
683 break;
684 }
685 case '}':
686 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
687 PrintF("QuantifierPrefix{DecimalDigits}\n");
688 Advance();
689 break;
690 default:
691 Advance();
692 return false;
693 }
694 return true;
695 }
696
ParseQuantifier(size_t atomBcStart,int captureStart,int captureEnd)697 void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
698 {
699 int min = -1;
700 int max = -1;
701 bool isGreedy = true;
702 switch (c0_) {
703 case '*':
704 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
705 PrintF("QuantifierPrefix %c\n", c0_);
706 min = 0;
707 max = INT32_MAX;
708 Advance();
709 break;
710 case '+':
711 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
712 PrintF("QuantifierPrefix %c\n", c0_);
713 min = 1;
714 max = INT32_MAX;
715 Advance();
716 break;
717 case '?':
718 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
719 PrintF("QuantifierPrefix %c\n", c0_);
720 Advance();
721 min = 0;
722 max = 1;
723 break;
724 case '{': {
725 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
726 if (!ParserIntervalQuantifier(&min, &max)) {
727 pc_ = start;
728 Advance(); // back to '{'
729 return;
730 }
731 if (min > max) {
732 ParseError("Invalid repetition count");
733 return;
734 }
735 break;
736 }
737 default:
738 break;
739 }
740 if (c0_ == '?') {
741 isGreedy = false;
742 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
743 PrintF("Quantifier::QuantifierPrefix?\n");
744 Advance();
745 } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
746 ParseError("nothing to repeat");
747 return;
748 }
749 if (min != -1 && max != -1) {
750 stackCount_++;
751 PushOpCode pushOp;
752 pushOp.InsertOpCode(&buffer_, atomBcStart);
753 atomBcStart += pushOp.GetSize();
754
755 if (captureStart != 0) {
756 SaveResetOpCode saveResetOp;
757 saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
758 }
759
760 // zero advance check
761 if (max == INT32_MAX) {
762 stackCount_++;
763 PushCharOpCode pushCharOp;
764 pushCharOp.InsertOpCode(&buffer_, atomBcStart);
765 CheckCharOpCode checkCharOp;
766 // NOLINTNEXTLINE(readability-magic-numbers)
767 checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
768 }
769
770 if (isGreedy) {
771 LoopGreedyOpCode loopOp;
772 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
773 } else {
774 LoopOpCode loopOp;
775 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
776 }
777
778 if (min == 0) {
779 if (isGreedy) {
780 SplitNextOpCode splitNextOp;
781 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
782 } else {
783 SplitFirstOpCode splitFirstOp;
784 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
785 }
786 }
787
788 PopOpCode popOp;
789 popOp.EmitOpCode(&buffer_);
790 }
791 }
792
ParseGroupSpecifier(const uint8_t ** pp,CString & name)793 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, CString &name)
794 {
795 const uint8_t *p = *pp;
796 uint32_t c = 0;
797 char buffer[CACHE_SIZE] = {0};
798 char *q = buffer;
799 while (true) {
800 if (p <= end_) {
801 c = *p;
802 } else {
803 c = KEY_EOF;
804 }
805 if (c == '\\') {
806 p++;
807 if (*p != 'u') {
808 return false;
809 }
810 if (!ParseUnicodeEscape(&c)) {
811 return false;
812 }
813 } else if (c == '>') {
814 break;
815 } else if (c > CACHE_SIZE && c != KEY_EOF) {
816 c = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
817 } else if (c != KEY_EOF) {
818 p++;
819 } else {
820 return false;
821 }
822 if (q == buffer) {
823 if (!IsIdentFirst(c)) {
824 return false;
825 }
826 } else {
827 if (!u_isIDPart(c)) {
828 return false;
829 }
830 }
831 if (q != nullptr) {
832 *q++ = c;
833 }
834 } // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
835 p++;
836 *pp = p;
837 name = buffer;
838 return true;
839 }
840
ParseCaptureCount(const char * groupName)841 int RegExpParser::ParseCaptureCount(const char *groupName)
842 {
843 const uint8_t *p = nullptr;
844 int captureIndex = 1;
845 CString name;
846 hasNamedCaptures_ = 0;
847 for (p = base_; p < end_; p++) { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
848 switch (*p) {
849 case '(': {
850 if (p[1] == '?') { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
851 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
852 if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
853 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
854 p[CAPTURE_CONUT_ADVANCE] != '=') {
855 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
856 hasNamedCaptures_ = 1;
857 p += CAPTURE_CONUT_ADVANCE;
858 if (groupName != nullptr) {
859 if (ParseGroupSpecifier(&p, name)) {
860 if (strcmp(name.c_str(), groupName) == 0) {
861 return captureIndex;
862 }
863 }
864 }
865 captureIndex++;
866 }
867 } else {
868 captureIndex++;
869 }
870 break;
871 }
872 case '\\':
873 p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
874 break;
875 case '[': {
876 while (p < end_ && *p != ']') {
877 if (*p == '\\') {
878 p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
879 }
880 p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
881 }
882 break;
883 }
884 default:
885 break;
886 }
887 }
888 return captureIndex;
889 }
890
891 // NOLINTNEXTLINE(readability-function-size)
ParseAtomEscape(bool isBackward)892 int RegExpParser::ParseAtomEscape(bool isBackward)
893 {
894 // AtomEscape[U, N]::
895 // DecimalEscape
896 // CharacterClassEscape[?U]
897 // CharacterEscape[?U]
898 // [+N]kGroupName[?U]
899 int result = -1;
900 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
901 PrintF("Parse AtomEscape------\n");
902 PrevOpCode prevOp;
903 switch (c0_) {
904 case KEY_EOF:
905 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
906 ParseError("unexpected end");
907 break;
908 // DecimalEscape
909 case '1':
910 case '2':
911 case '3':
912 case '4':
913 case '5':
914 case '6':
915 case '7':
916 case '8':
917 case '9': {
918 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
919 PrintF("NonZeroDigit %c\n", c0_);
920 int capture = ParseDecimalDigits();
921 if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
922 ParseError("invalid backreference count");
923 break;
924 }
925 if (isBackward) {
926 BackwardBackReferenceOpCode backReferenceOp;
927 backReferenceOp.EmitOpCode(&buffer_, capture);
928 } else {
929 BackReferenceOpCode backReferenceOp;
930 backReferenceOp.EmitOpCode(&buffer_, capture);
931 }
932 break;
933 }
934 // CharacterClassEscape
935 case 'd': {
936 // [0-9]
937 RangeOpCode rangeOp;
938 if (isBackward) {
939 prevOp.EmitOpCode(&buffer_, 0);
940 }
941 rangeOp.InsertOpCode(&buffer_, g_rangeD);
942 goto parseLookBehind;
943 }
944 case 'D': {
945 // [^0-9]
946 RangeSet atomRange(g_rangeD);
947 atomRange.Invert(IsUtf16());
948 Range32OpCode rangeOp;
949 if (isBackward) {
950 prevOp.EmitOpCode(&buffer_, 0);
951 }
952 rangeOp.InsertOpCode(&buffer_, atomRange);
953 goto parseLookBehind;
954 }
955 case 's': {
956 // [\f\n\r\t\v]
957 RangeOpCode rangeOp;
958 if (isBackward) {
959 prevOp.EmitOpCode(&buffer_, 0);
960 }
961 rangeOp.InsertOpCode(&buffer_, g_rangeS);
962 goto parseLookBehind;
963 }
964 case 'S': {
965 RangeSet atomRange(g_rangeS);
966 Range32OpCode rangeOp;
967 atomRange.Invert(IsUtf16());
968 if (isBackward) {
969 prevOp.EmitOpCode(&buffer_, 0);
970 }
971 rangeOp.InsertOpCode(&buffer_, atomRange);
972 goto parseLookBehind;
973 }
974 case 'w': {
975 // [A-Za-z0-9]
976 RangeOpCode rangeOp;
977 if (isBackward) {
978 prevOp.EmitOpCode(&buffer_, 0);
979 }
980 rangeOp.InsertOpCode(&buffer_, g_rangeW);
981 goto parseLookBehind;
982 }
983 case 'W': {
984 // [^A-Za-z0-9]
985 RangeSet atomRange(g_rangeW);
986 atomRange.Invert(IsUtf16());
987 Range32OpCode rangeOp;
988 if (isBackward) {
989 prevOp.EmitOpCode(&buffer_, 0);
990 }
991 rangeOp.InsertOpCode(&buffer_, atomRange);
992 goto parseLookBehind;
993 }
994 // P{UnicodePropertyValueExpression}
995 // p{UnicodePropertyValueExpression}
996 case 'P':
997 case 'p':
998 // [+N]kGroupName[?U]
999 case 'k': {
1000 Advance();
1001 if (c0_ != '<') {
1002 if (!IsUtf16() || HasNamedCaptures()) {
1003 ParseError("expecting group name.");
1004 break;
1005 }
1006 }
1007 Advance();
1008 Prev();
1009 CString name;
1010 auto **pp = const_cast<const uint8_t **>(&pc_);
1011 if (!ParseGroupSpecifier(pp, name)) {
1012 ParseError("GroupName Syntax error.");
1013 break;
1014 }
1015 int postion = FindGroupName(name);
1016 if (postion < 0) {
1017 postion = ParseCaptureCount(name.c_str());
1018 if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
1019 ParseError("group name not defined");
1020 break;
1021 }
1022 }
1023 if (isBackward) {
1024 BackwardBackReferenceOpCode backReferenceOp;
1025 backReferenceOp.EmitOpCode(&buffer_, postion);
1026 } else {
1027 BackReferenceOpCode backReferenceOp;
1028 backReferenceOp.EmitOpCode(&buffer_, postion);
1029 }
1030 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1031 Advance();
1032 break;
1033 }
1034 parseLookBehind: {
1035 if (isBackward) {
1036 prevOp.EmitOpCode(&buffer_, 0);
1037 }
1038 Advance();
1039 break;
1040 }
1041 default:
1042 result = ParseCharacterEscape();
1043 break;
1044 }
1045 return result;
1046 }
1047
RecountCaptures()1048 int RegExpParser::RecountCaptures()
1049 {
1050 if (totalCaptureCount_ < 0) {
1051 const char *name = reinterpret_cast<const char*>(groupNames_.buf_);
1052 totalCaptureCount_ = ParseCaptureCount(name);
1053 }
1054 return totalCaptureCount_;
1055 }
HasNamedCaptures()1056 bool RegExpParser::HasNamedCaptures()
1057 {
1058 if (hasNamedCaptures_ < 0) {
1059 RecountCaptures();
1060 }
1061 return false;
1062 }
1063
/**
 * Parses the escape that starts at the current character c0_ (the character
 * following a backslash) and returns the value it denotes.
 *
 * On a syntax error ParseError() is called and the function returns early.
 * NOTE(review): the early error returns are not uniform -- some paths return -1
 * and others return 0 -- so callers cannot rely on the return value alone to
 * detect failure.
 *
 * @return the escaped character value converted to int, or -1 / 0 on the error
 *         paths described above.
 */
int RegExpParser::ParseCharacterEscape()
{
    // CharacterEscape[U]::
    //     ControlEscape
    //     c ControlLetter
    //     0 [lookahead ∉ DecimalDigit]
    //     HexEscapeSequence
    //     RegExpUnicodeEscapeSequence[?U]
    //     IdentityEscape[?U]
    uint32_t result = 0;
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
    switch (c0_) {
        // ControlEscape
        // Each control escape maps to its C escape value and consumes one character.
        case 'f':
            result = '\f';
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("ControlEscape %c\n", c0_);
            Advance();
            break;
        case 'n':
            result = '\n';
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("ControlEscape %c\n", c0_);
            Advance();
            break;
        case 'r':
            result = '\r';
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("ControlEscape %c\n", c0_);
            Advance();
            break;
        case 't':
            result = '\t';
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("ControlEscape %c\n", c0_);
            Advance();
            break;
        case 'v':
            result = '\v';
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("ControlEscape %c\n", c0_);
            Advance();
            break;
        // c ControlLetter
        case 'c': {
            Advance();
            if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("ControlLetter %c\n", c0_);
                // The control character is the low five bits of the letter.
                result = static_cast<uint32_t>(c0_) & 0x1f;  // NOLINTNEXTLINE(readability-magic-numbers)
                Advance();
            } else {
                if (!IsUtf16()) {
                    // Without the unicode flag an incomplete \c is not an error: the input
                    // pointer is stepped back by one and a literal backslash is produced.
                    pc_--;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                    result = '\\';
                } else {
                    ParseError("Invalid control letter");
                    return -1;
                }
            }
            break;
        }
        case '0': {
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
            // In unicode mode, \0 not followed by a decimal digit yields the value 0.
            if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) {  // NOLINTNEXTLINE(readability-magic-numbers)
                Advance();
                result = 0;
                break;
            }
            // Otherwise '0' is handled together with the digits '1'..'7' below.
            [[fallthrough]];
        }
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7': {
            if (IsUtf16()) {
                // With /u, decimal escape is not interpreted as octal character code.
                ParseError("Invalid class escape");
                // NOTE(review): returns 0 here whereas other error paths return -1.
                return 0;
            }
            result = ParseOctalLiteral();
            break;
        }
        // ParseHexEscapeSequence
        // ParseRegExpUnicodeEscapeSequence
        case 'x': {
            Advance();
            // On success the decoded value is returned directly.
            if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
                return result;
            }
            if (IsUtf16()) {
                ParseError("Invalid class escape");
                return -1;
            }
            // Outside unicode mode a malformed \x is treated as the literal character 'x'.
            result = 'x';
            break;
        }
        case 'u': {
            Advance();
            // On success the decoded value is returned directly.
            if (ParseUnicodeEscape(&result)) {
                return result;
            }
            if (IsUtf16()) {
                // With /u, invalid escapes are not treated as identity escapes.
                ParseError("Invalid unicode escape");
                // NOTE(review): returns 0 here whereas the '\x' error path returns -1.
                return 0;
            }
            // If \u is not followed by a two-digit hexadecimal, treat it
            // as an identity escape.
            result = 'u';
            break;
        }
        // IdentityEscape[?U]
        // Syntax characters escape to themselves in every mode.
        case '$':
        case '(':
        case ')':
        case '*':
        case '+':
        case '.':
        case '/':
        case '?':
        case '[':
        case '\\':
        case ']':
        case '^':
        case '{':
        case '|':
        case '}':
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("IdentityEscape %c\n", c0_);
            result = c0_;
            Advance();
            break;
        default: {
            // Any other character is rejected in unicode mode and taken literally otherwise.
            if (IsUtf16()) {
                ParseError("Invalid unicode escape");
                // NOTE(review): returns 0 here whereas other error paths return -1.
                return 0;
            }
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("SourceCharacter %c\n", c0_);
            result = c0_;
            if (result < CHAR_MAXS) {
                // Values below CHAR_MAXS occupy a single byte: consume it and keep the value.
                Advance();
            } else {
                // Larger values are decoded as a UTF-8 sequence. Prev() is called first --
                // presumably to put pc_ back on the lead byte; confirm against Prev().
                Prev();
                const uint8_t *p = pc_;
                result = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
                // p now points past the decoded bytes; advance by that many plus one.
                int offset = static_cast<int>(p - pc_);
                Advance(offset + 1);
            }
            break;
        }
    }
    return static_cast<int>(result);
}
1223
ParseClassRanges(RangeSet * result)1224 bool RegExpParser::ParseClassRanges(RangeSet *result)
1225 {
1226 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1227 PrintF("Parse ClassRanges------\n");
1228 while (c0_ != ']') {
1229 RangeSet s1;
1230 bool needInter = false;
1231 uint32_t c1 = ParseClassAtom(&s1);
1232 if (c1 == UINT32_MAX) {
1233 ParseError("invalid class range");
1234 return false;
1235 }
1236 needInter = NeedIntersection(c1);
1237 int next_c0 = *pc_;
1238 if (c0_ == '-' && next_c0 != ']') {
1239 if (c1 == CLASS_RANGE_BASE) {
1240 if (IsUtf16()) {
1241 ParseError("invalid class range");
1242 return false;
1243 }
1244 result->Insert(s1);
1245 continue;
1246 }
1247 Advance();
1248 RangeSet s2;
1249 uint32_t c2 = ParseClassAtom(&s2);
1250 if (c2 == UINT32_MAX) {
1251 ParseError("invalid class range");
1252 return false;
1253 }
1254 if (c2 == CLASS_RANGE_BASE) {
1255 if (IsUtf16()) {
1256 ParseError("invalid class range");
1257 return false;
1258 }
1259 result->Insert(s2);
1260 continue;
1261 }
1262 if (c1 < INT8_MAX) {
1263 if (c1 > c2) {
1264 ParseError("invalid class range");
1265 return false;
1266 }
1267 }
1268 needInter = NeedIntersection(c2);
1269 result->Insert(c1, c2);
1270 if (IsIgnoreCase() && needInter) {
1271 ProcessIntersection(result);
1272 }
1273 } else {
1274 result->Insert(s1);
1275 if (!(IsIgnoreCase() && needInter)) {
1276 continue;
1277 }
1278 if (c1 <= 'z' && c1 >= 'a') {
1279 result->Insert(RangeSet(c1 - 'a' + 'A'));
1280 } else {
1281 result->Insert(RangeSet(c1 - 'A' + 'a'));
1282 }
1283 }
1284 }
1285 Advance();
1286 return true;
1287 }
1288
ParseClassAtom(RangeSet * atom)1289 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1290 {
1291 uint32_t ret = UINT32_MAX;
1292 switch (c0_) {
1293 case '\\': {
1294 Advance();
1295 ret = static_cast<uint32_t>(ParseClassEscape(atom));
1296 break;
1297 }
1298 case KEY_EOF:
1299 break;
1300 case 0: {
1301 if (pc_ >= end_) {
1302 return UINT32_MAX;
1303 }
1304 [[fallthrough]];
1305 }
1306 default: {
1307 uint32_t value = c0_;
1308 size_t u16_size = 0;
1309 if (c0_ > INT8_MAX) { // NOLINTNEXTLINE(readability-magic-numbers)
1310 pc_ -= 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1311 auto u16_result = base::utf_helper::ConvertUtf8ToUtf16Pair(pc_, true);
1312 value = u16_result.first;
1313 u16_size = u16_result.second;
1314 Advance(u16_size + 1);
1315 } else {
1316 Advance();
1317 }
1318 atom->Insert(RangeSet(value));
1319 ret = value;
1320 break;
1321 }
1322 }
1323 return ret;
1324 }
1325
ParseClassEscape(RangeSet * atom)1326 int RegExpParser::ParseClassEscape(RangeSet *atom)
1327 {
1328 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1329 PrintF("Parse ClassEscape------\n");
1330 int result = -1;
1331 switch (c0_) {
1332 case 'b':
1333 Advance();
1334 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1335 PrintF("ClassEscape %c", 'b');
1336 result = '\b';
1337 atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1338 break;
1339 case '-':
1340 Advance();
1341 result = '-';
1342 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1343 PrintF("ClassEscape %c", '-');
1344 atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1345 break;
1346 // CharacterClassEscape
1347 case 'd':
1348 case 'D':
1349 result = CLASS_RANGE_BASE;
1350 atom->Insert(g_rangeD);
1351 if (c0_ == 'D') {
1352 atom->Invert(IsUtf16());
1353 }
1354 Advance();
1355 break;
1356 case 's':
1357 case 'S':
1358 result = CLASS_RANGE_BASE;
1359 atom->Insert(g_rangeS);
1360 if (c0_ == 'S') {
1361 atom->Invert(IsUtf16());
1362 }
1363 Advance();
1364 break;
1365 case 'w':
1366 case 'W':
1367 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1368 PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1369 result = CLASS_RANGE_BASE;
1370 atom->Insert(g_rangeW);
1371 if (c0_ == 'W') {
1372 atom->Invert(IsUtf16());
1373 }
1374 Advance();
1375 break;
1376 // P{UnicodePropertyValueExpression}
1377 // p{UnicodePropertyValueExpression}
1378 case 'P':
1379 case 'p':
1380 PrintF("Warning: \\p is not supported in ECMA 2015!");
1381 Advance();
1382 if (c0_ == '{') {
1383 Advance();
1384 if (c0_ == '}') {
1385 break; // p{}, invalid
1386 }
1387 bool isValue = false;
1388 ParseUnicodePropertyValueCharacters(&isValue);
1389 if (!isValue && c0_ == '=') {
1390 // UnicodePropertyName = UnicodePropertyValue
1391 Advance();
1392 if (c0_ == '}') {
1393 break; // p{xxx=}, invalid
1394 }
1395 ParseUnicodePropertyValueCharacters(&isValue);
1396 }
1397 if (c0_ != '}') {
1398 break; // p{xxx, invalid
1399 }
1400 // should do atom->Invert() here after ECMA 9.0
1401 Advance();
1402 result = CLASS_RANGE_BASE;
1403 }
1404 break;
1405 default:
1406 result = ParseCharacterEscape();
1407 int value = result;
1408 if (IsIgnoreCase()) {
1409 value = Canonicalize(value, IsUtf16());
1410 }
1411 atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1412 break;
1413 }
1414 return result;
1415 }
1416
ParseUnicodePropertyValueCharacters(bool * isValue)1417 void RegExpParser::ParseUnicodePropertyValueCharacters(bool *isValue)
1418 {
1419 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1420 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1421 PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1422 } else if (c0_ == '_') {
1423 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1424 PrintF("UnicodePropertyCharacter:: _ \n");
1425 } else if (c0_ >= '0' && c0_ <= '9') {
1426 *isValue = true;
1427 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1428 PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1429 } else {
1430 return;
1431 }
1432 Advance();
1433 ParseUnicodePropertyValueCharacters(isValue);
1434 }
1435
1436 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1437 void RegExpParser::PrintF(const char *fmt, ...)
1438 {
1439 #ifndef _NO_DEBUG_
1440 va_list args;
1441 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,)
1442 va_start(args, fmt);
1443 vprintf(fmt, args);
1444 va_end(args);
1445 #else
1446 (void)fmt;
1447 #endif
1448 }
1449
ParseError(const char * errorMessage)1450 void RegExpParser::ParseError(const char *errorMessage)
1451 {
1452 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1453 PrintF("error: ");
1454 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1455 PrintF(errorMessage);
1456 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1457 PrintF("\n");
1458 SetIsError();
1459 size_t length = strlen(errorMessage) + 1;
1460 if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1461 LOG_FULL(FATAL) << "memcpy_s failed";
1462 UNREACHABLE();
1463 }
1464 }
1465
IsIdentFirst(uint32_t c)1466 int RegExpParser::IsIdentFirst(uint32_t c)
1467 {
1468 if (c < CACHE_SIZE) {
1469 return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1; // 5: Shift five bits 31: and operation binary of 31
1470 } else {
1471 return static_cast<int>(u_isIDStart(c));
1472 }
1473 }
1474
Canonicalize(int c,bool isUnicode)1475 int RegExpParser::Canonicalize(int c, bool isUnicode)
1476 {
1477 if (c < TMP_BUF_SIZE) { // NOLINTNEXTLINE(readability-magic-numbers)
1478 if (c >= 'a' && c <= 'z') {
1479 c = c - 'a' + 'A';
1480 }
1481 } else {
1482 int cur = c;
1483 if (isUnicode) {
1484 c = u_tolower(static_cast<UChar32>(c));
1485 if (c >= 'a' && c <= 'z') {
1486 c = cur;
1487 }
1488 } else {
1489 c = u_toupper(static_cast<UChar32>(c));
1490 if (c >= 'A' && c <= 'Z') {
1491 c = cur;
1492 }
1493 }
1494 }
1495 return c;
1496 }
1497
NeedIntersection(uint32_t c)1498 bool RegExpParser::NeedIntersection(uint32_t c)
1499 {
1500 return (c <= 'z' && c >= 'a') || (c <= 'Z' && c >= 'A');
1501 }
1502 } // namespace panda::ecmascript
1503