1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "ecmascript/regexp/regexp_parser.h"
17
18 #include "ecmascript/base/string_helper.h"
19 #include "ecmascript/ecma_macros.h"
20 #include "ecmascript/regexp/regexp_opcode.h"
21 #include "libpandabase/utils/utils.h"
22 #include "securec.h"
23 #include "unicode/uchar.h"
24 #include "unicode/uniset.h"
25 #define _NO_DEBUG_
26
27 namespace panda::ecmascript {
28 static constexpr uint32_t CACHE_SIZE = 128;
29 static constexpr uint32_t CHAR_MAXS = 128;
30 static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
31 /* $ A-Z _ a-z */
32 0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
33 };
34 static RangeSet g_rangeD(0x30, 0x39); // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
35 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
36 static RangeSet g_rangeS({
37 std::pair<uint32_t, uint32_t>(0x0009, 0x000D), // NOLINTNEXTLINE(readability-magic-numbers)
38 std::pair<uint32_t, uint32_t>(0x0020, 0x0020), // NOLINTNEXTLINE(readability-magic-numbers)
39 std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0), // NOLINTNEXTLINE(readability-magic-numbers)
40 std::pair<uint32_t, uint32_t>(0x1680, 0x1680), // NOLINTNEXTLINE(readability-magic-numbers)
41 std::pair<uint32_t, uint32_t>(0x2000, 0x200A), // NOLINTNEXTLINE(readability-magic-numbers)
42 /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
43 /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
44 std::pair<uint32_t, uint32_t>(0x2028, 0x2029), // NOLINTNEXTLINE(readability-magic-numbers)
45 std::pair<uint32_t, uint32_t>(0x202F, 0x202F), // NOLINTNEXTLINE(readability-magic-numbers)
46 std::pair<uint32_t, uint32_t>(0x205F, 0x205F), // NOLINTNEXTLINE(readability-magic-numbers)
47 std::pair<uint32_t, uint32_t>(0x3000, 0x3000), // NOLINTNEXTLINE(readability-magic-numbers)
48 /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
49 std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF), // NOLINTNEXTLINE(readability-magic-numbers)
50 });
51
52 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
53 static RangeSet g_rangeW({
54 std::pair<uint32_t, uint32_t>(0x0030, 0x0039), // NOLINTNEXTLINE(readability-magic-numbers)
55 std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINTNEXTLINE(readability-magic-numbers)
56 std::pair<uint32_t, uint32_t>(0x005F, 0x005F), // NOLINTNEXTLINE(readability-magic-numbers)
57 std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINTNEXTLINE(readability-magic-numbers)
58 });
59
60 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
61 static RangeSet g_regexpIdentifyStart({
62 std::pair<uint32_t, uint32_t>(0x0024, 0x0024), // NOLINTNEXTLINE(readability-magic-numbers)
63 std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINTNEXTLINE(readability-magic-numbers)
64 std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINTNEXTLINE(readability-magic-numbers)
65 });
66
67 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
68 static RangeSet g_regexpIdentifyContinue({
69 std::pair<uint32_t, uint32_t>(0x0024, 0x0024), // NOLINTNEXTLINE(readability-magic-numbers)
70 std::pair<uint32_t, uint32_t>(0x0030, 0x0039), // NOLINTNEXTLINE(readability-magic-numbers)
71 std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINTNEXTLINE(readability-magic-numbers)
72 std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINTNEXTLINE(readability-magic-numbers)
73 });
74
Parse()75 void RegExpParser::Parse()
76 {
77 // dynbuffer head init [size,capture_count,statck_count,flags]
78 buffer_.EmitU32(0);
79 buffer_.EmitU32(0);
80 buffer_.EmitU32(0);
81 buffer_.EmitU32(0);
82 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
83 PrintF("Parse Pattern------\n");
84 // Pattern[U, N]::
85 // Disjunction[?U, ?N]
86 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
87 Advance();
88 SaveStartOpCode saveStartOp;
89 int captureIndex = captureCount_++;
90 saveStartOp.EmitOpCode(&buffer_, captureIndex);
91 ParseDisjunction(false);
92 if (c0_ != KEY_EOF) {
93 ParseError("extraneous characters at the end");
94 return;
95 }
96 SaveEndOpCode saveEndOp;
97 saveEndOp.EmitOpCode(&buffer_, captureIndex);
98 MatchEndOpCode matchEndOp;
99 matchEndOp.EmitOpCode(&buffer_, 0);
100 // dynbuffer head assignments
101 buffer_.PutU32(0, buffer_.size_);
102 buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
103 buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
104 buffer_.PutU32(FLAGS_OFFSET, flags_);
105 #ifndef _NO_DEBUG_
106 RegExpOpCode::DumpRegExpOpCode(std::cout, buffer_);
107 #endif
108 }
109
ParseDisjunction(bool isBackward)110 void RegExpParser::ParseDisjunction(bool isBackward)
111 {
112 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
113 PrintF("Parse Disjunction------\n");
114 if (c0_ == ')') {
115 isEmpty_ = true;
116 return;
117 }
118 size_t start = buffer_.size_;
119 ParseAlternative(isBackward);
120 if (isError_) {
121 return;
122 }
123 do {
124 if (c0_ == '|') {
125 SplitNextOpCode splitOp;
126 uint32_t len = buffer_.size_ - start;
127 GotoOpCode gotoOp;
128 splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
129 uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
130 Advance();
131 ParseAlternative(isBackward);
132 gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
133 }
134 } while (c0_ != KEY_EOF && c0_ != ')');
135 }
136
ParseOctalLiteral()137 uint32_t RegExpParser::ParseOctalLiteral()
138 {
139 // For compatibility with some other browsers (not all), we parse
140 // up to three octal digits with a value below 256.
141 // ES#prod-annexB-LegacyOctalEscapeSequence
142 uint32_t value = c0_ - '0';
143 Advance();
144 if (c0_ >= '0' && c0_ <= '7') {
145 value = value * OCTAL_VALUE + c0_ - '0';
146 Advance();
147 if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
148 value = value * OCTAL_VALUE + c0_ - '0';
149 Advance();
150 }
151 }
152 return value;
153 }
154
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)155 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
156 {
157 uint32_t x = 0;
158 int d = static_cast<int>(HexValue(c0_));
159 if (d < 0) {
160 return false;
161 }
162 while (d >= 0) {
163 if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
164 LOG_FULL(FATAL) << "value overflow";
165 return false;
166 }
167 x = x * HEX_VALUE + static_cast<uint32_t>(d);
168 if (x > maxValue) {
169 return false;
170 }
171 Advance();
172 d = static_cast<int>(HexValue(c0_));
173 }
174 *value = x;
175 return true;
176 }
177
178 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uint32_t * value)179 bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
180 {
181 // Accept both \uxxxx and \u{xxxxxx} (if allowed).
182 // In the latter case, the number of hex digits between { } is arbitrary.
183 // \ and u have already been read.
184 if (c0_ == '{' && IsUtf16()) {
185 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
186 Advance();
187 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { // NOLINTNEXTLINE(readability-magic-numbers)
188 if (c0_ == '}') {
189 Advance();
190 return true;
191 }
192 }
193 pc_ = start;
194 Advance();
195 return false;
196 }
197 // \u but no {, or \u{...} escapes not allowed.
198 bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
199 if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
200 // Attempt to read trail surrogate.
201 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
202 if (*pc_ == 'u') {
203 Advance(UNICODE_HEX_ADVANCE);
204 uint32_t trail = 0;
205 if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
206 *value = U16_GET_SUPPLEMENTARY((*value), (trail)); // NOLINTNEXTLINE(hicpp-signed-bitwise)
207 return true;
208 }
209 }
210 pc_ = start;
211 Advance();
212 }
213 return result;
214 }
215
ParseHexEscape(int length,uint32_t * value)216 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
217 {
218 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
219 uint32_t val = 0;
220 for (int i = 0; i < length; ++i) {
221 uint32_t c = c0_;
222 int d = static_cast<int>(HexValue(c));
223 if (d < 0) {
224 pc_ = start;
225 Advance();
226 return false;
227 }
228 val = val * HEX_VALUE + static_cast<uint32_t>(d);
229 Advance();
230 }
231 *value = val;
232 return true;
233 }
234
235 // NOLINTNEXTLINE(readability-function-size)
ParseAlternative(bool isBackward)236 void RegExpParser::ParseAlternative(bool isBackward)
237 {
238 size_t start = buffer_.size_;
239 while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
240 if (isError_) {
241 return;
242 }
243 size_t atomBcStart = buffer_.GetSize();
244 int captureIndex = 0;
245 bool isAtom = false;
246 switch (c0_) {
247 case '^': {
248 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
249 PrintF("Assertion %c line start \n", c0_);
250 LineStartOpCode lineStartOp;
251 lineStartOp.EmitOpCode(&buffer_, 0);
252 Advance();
253 break;
254 }
255 case '$': {
256 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
257 PrintF("Assertion %c line end \n", c0_);
258 LineEndOpCode lineEndOp;
259 lineEndOp.EmitOpCode(&buffer_, 0);
260 Advance();
261 break;
262 }
263 case '\\': {
264 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
265 PrintF("Escape %c \n", c0_);
266 Advance();
267 switch (c0_) {
268 case 'b': {
269 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
270 PrintF("Assertion %c \n", c0_);
271 WordBoundaryOpCode wordBoundaryOp;
272 wordBoundaryOp.EmitOpCode(&buffer_, 0);
273 Advance();
274 break;
275 }
276 case 'B': {
277 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
278 PrintF("Assertion %c \n", c0_);
279 NotWordBoundaryOpCode notWordBoundaryOp;
280 notWordBoundaryOp.EmitOpCode(&buffer_, 0);
281 Advance();
282 break;
283 }
284 default: {
285 isAtom = true;
286 int atomValue = ParseAtomEscape(isBackward);
287 if (atomValue != -1) {
288 PrevOpCode prevOp;
289 if (isBackward) {
290 prevOp.EmitOpCode(&buffer_, 0);
291 }
292 if (IsIgnoreCase()) {
293 if (!IsUtf16()) {
294 atomValue = Canonicalize(atomValue, false);
295 } else {
296 icu::UnicodeSet set(atomValue, atomValue);
297 set.closeOver(USET_CASE_INSENSITIVE);
298 set.removeAllStrings();
299 uint32_t size = static_cast<uint32_t>(set.size());
300 RangeOpCode rangeOp;
301 RangeSet rangeResult;
302 for (uint32_t idx = 0; idx < size; idx++) {
303 int32_t uc = set.charAt(idx);
304 RangeSet curRange(uc);
305 rangeResult.Insert(curRange);
306 }
307 rangeOp.InsertOpCode(&buffer_, rangeResult);
308 break;
309 }
310 }
311 if (atomValue <= UINT16_MAX) {
312 CharOpCode charOp;
313 charOp.EmitOpCode(&buffer_, atomValue);
314 } else {
315 Char32OpCode charOp;
316 charOp.EmitOpCode(&buffer_, atomValue);
317 }
318 if (isBackward) {
319 prevOp.EmitOpCode(&buffer_, 0);
320 }
321 }
322 break;
323 }
324 }
325 break;
326 }
327 case '(': {
328 Advance();
329 isAtom = ParseAssertionCapture(&captureIndex, isBackward);
330 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
331 Advance();
332 break;
333 }
334 case '.': {
335 PrevOpCode prevOp;
336 if (isBackward) {
337 prevOp.EmitOpCode(&buffer_, 0);
338 }
339 if (IsDotAll()) {
340 AllOpCode allOp;
341 allOp.EmitOpCode(&buffer_, 0);
342 } else {
343 DotsOpCode dotsOp;
344 dotsOp.EmitOpCode(&buffer_, 0);
345 }
346 if (isBackward) {
347 prevOp.EmitOpCode(&buffer_, 0);
348 }
349 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
350 PrintF("Atom %c match any \n", c0_);
351 isAtom = true;
352 Advance();
353 break;
354 }
355 case '[': {
356 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
357 PrintF("Atom %c match range \n", c0_);
358 isAtom = true;
359 PrevOpCode prevOp;
360 Advance();
361 if (isBackward) {
362 prevOp.EmitOpCode(&buffer_, 0);
363 }
364 bool isInvert = false;
365 if (c0_ == '^') {
366 isInvert = true;
367 Advance();
368 }
369 RangeSet rangeResult;
370 if (!ParseClassRanges(&rangeResult)) {
371 break;
372 }
373 if (isInvert) {
374 rangeResult.Invert(IsUtf16());
375 }
376 uint32_t highValue = rangeResult.HighestValue();
377 if (highValue <= UINT16_MAX) {
378 RangeOpCode rangeOp;
379 rangeOp.InsertOpCode(&buffer_, rangeResult);
380 } else {
381 Range32OpCode rangeOp;
382 rangeOp.InsertOpCode(&buffer_, rangeResult);
383 }
384
385 if (isBackward) {
386 prevOp.EmitOpCode(&buffer_, 0);
387 }
388 break;
389 }
390 case '*':
391 case '+':
392 case '?':
393 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
394 ParseError("nothing to repeat");
395 return;
396 case '{': {
397 uint8_t *begin = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
398 int dummy;
399 if (ParserIntervalQuantifier(&dummy, &dummy)) {
400 ParseError("nothing to repeat");
401 return;
402 }
403 pc_ = begin;
404 Advance();
405 }
406 [[fallthrough]];
407 case '}':
408 case ']':
409 if (IsUtf16()) {
410 ParseError("syntax error");
411 return;
412 }
413 [[fallthrough]];
414 default: {
415 // PatternCharacter
416 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
417 PrintF("PatternCharacter %c\n", c0_);
418 isAtom = true;
419 {
420 PrevOpCode prevOp;
421 if (isBackward) {
422 prevOp.EmitOpCode(&buffer_, 0);
423 }
424 uint32_t matchedChar = c0_;
425 if (c0_ > (INT8_MAX + 1)) {
426 Prev();
427 int i = 0;
428 UChar32 c;
429 int32_t length = end_ - pc_ + 1;
430 // NOLINTNEXTLINE(hicpp-signed-bitwise)
431 U8_NEXT(pc_, i, length, c); // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
432 matchedChar = static_cast<uint32_t>(c);
433 pc_ += i; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
434 }
435 if (IsIgnoreCase()) {
436 matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
437 }
438 if (matchedChar > UINT16_MAX) {
439 Char32OpCode charOp;
440 charOp.EmitOpCode(&buffer_, matchedChar);
441 } else {
442 CharOpCode charOp;
443 charOp.EmitOpCode(&buffer_, matchedChar);
444 }
445 if (isBackward) {
446 prevOp.EmitOpCode(&buffer_, 0);
447 }
448 }
449 Advance();
450 break;
451 }
452 }
453 if (isAtom && !isError_) {
454 ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
455 }
456 if (isBackward) {
457 size_t end = buffer_.GetSize();
458 size_t termSize = end - atomBcStart;
459 size_t moveSize = end - start;
460 buffer_.Expand(end + termSize);
461 if (memmove_s(buffer_.buf_ + start + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
462 termSize, // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
463 moveSize,
464 buffer_.buf_ + start, // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
465 moveSize) != EOK) {
466 LOG_FULL(FATAL) << "memmove_s failed";
467 UNREACHABLE();
468 }
469 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
470 if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
471 LOG_FULL(FATAL) << "memcpy_s failed";
472 UNREACHABLE();
473 }
474 }
475 }
476 }
477
FindGroupName(const CString & name)478 int RegExpParser::FindGroupName(const CString &name)
479 {
480 size_t len = 0;
481 size_t nameLen = name.size();
482 const char *p = reinterpret_cast<char *>(groupNames_.buf_);
483 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
484 const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
485 int captureIndex = 1;
486 while (p < bufEnd) {
487 len = strlen(p);
488 if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
489 return captureIndex;
490 }
491 p += len + 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
492 captureIndex++;
493 }
494 return -1;
495 }
496
ParseAssertionCapture(int * captureIndex,bool isBackward)497 bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
498 {
499 bool isAtom = false;
500 do {
501 if (c0_ == '?') {
502 Advance();
503 switch (c0_) {
504 // (?=Disjunction[?U, ?N])
505 case '=': {
506 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
507 PrintF("Assertion(?= Disjunction)\n");
508 Advance();
509 uint32_t start = buffer_.size_;
510 ParseDisjunction(isBackward);
511 MatchOpCode matchOp;
512 matchOp.EmitOpCode(&buffer_, 0);
513 MatchAheadOpCode matchAheadOp;
514 uint32_t len = buffer_.size_ - start;
515 matchAheadOp.InsertOpCode(&buffer_, start, len);
516 break;
517 }
518 // (?!Disjunction[?U, ?N])
519 case '!': {
520 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
521 PrintF("Assertion(?! Disjunction)\n");
522 uint32_t start = buffer_.size_;
523 Advance();
524 ParseDisjunction(isBackward);
525 MatchOpCode matchOp;
526 matchOp.EmitOpCode(&buffer_, 0);
527 NegativeMatchAheadOpCode matchAheadOp;
528 uint32_t len = buffer_.size_ - start;
529 matchAheadOp.InsertOpCode(&buffer_, start, len);
530 break;
531 }
532 case '<': {
533 Advance();
534 // (?<=Disjunction[?U, ?N])
535 if (c0_ == '=') {
536 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
537 PrintF("Assertion(?<= Disjunction)\n");
538 Advance();
539 uint32_t start = buffer_.size_;
540 ParseDisjunction(true);
541 MatchOpCode matchOp;
542 matchOp.EmitOpCode(&buffer_, 0);
543 MatchAheadOpCode matchAheadOp;
544 uint32_t len = buffer_.size_ - start;
545 matchAheadOp.InsertOpCode(&buffer_, start, len);
546 // (?<!Disjunction[?U, ?N])
547 } else if (c0_ == '!') {
548 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
549 PrintF("Assertion(?<! Disjunction)\n");
550 Advance();
551 uint32_t start = buffer_.size_;
552 ParseDisjunction(true);
553 MatchOpCode matchOp;
554 matchOp.EmitOpCode(&buffer_, 0);
555 NegativeMatchAheadOpCode matchAheadOp;
556 uint32_t len = buffer_.size_ - start;
557 matchAheadOp.InsertOpCode(&buffer_, start, len);
558 } else {
559 Prev();
560 CString name;
561 auto **pp = const_cast<const uint8_t **>(&pc_);
562 if (!ParseGroupSpecifier(pp, name)) {
563 ParseError("GroupName Syntax error.");
564 return false;
565 }
566 if (FindGroupName(name) > 0) {
567 ParseError("Duplicate GroupName error.");
568 return false;
569 }
570 groupNames_.EmitStr(name.c_str());
571 newGroupNames_.push_back(name);
572 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
573 PrintF("group name %s", name.c_str());
574 Advance();
575 goto parseCapture; // NOLINTNEXTLINE(cppcoreguidelines-avoid-goto)
576 }
577 break;
578 }
579 // (?:Disjunction[?U, ?N])
580 case ':':
581 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
582 PrintF("Atom(?<: Disjunction)\n");
583 isAtom = true;
584 Advance();
585 ParseDisjunction(isBackward);
586 break;
587 default:
588 Advance();
589 ParseError("? Syntax error.");
590 return false;
591 }
592 if (isError_) {
593 return false;
594 }
595 } else {
596 groupNames_.EmitChar(0);
597 parseCapture:
598 isAtom = true;
599 *captureIndex = captureCount_++;
600 SaveEndOpCode saveEndOp;
601 SaveStartOpCode saveStartOp;
602 if (isBackward) {
603 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
604 } else {
605 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
606 }
607 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
608 PrintF("capture start %d \n", *captureIndex);
609 ParseDisjunction(isBackward);
610 if (isError_) {
611 return false;
612 }
613 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
614 PrintF("capture end %d \n", *captureIndex);
615 if (isBackward) {
616 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
617 } else {
618 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
619 }
620 }
621 } while (c0_ != ')' && c0_ != KEY_EOF);
622 if (c0_ != ')') {
623 ParseError("capture syntax error");
624 return false;
625 }
626 return isAtom;
627 }
628
ParseDecimalDigits()629 int RegExpParser::ParseDecimalDigits()
630 {
631 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
632 PrintF("Parse DecimalDigits------\n");
633 uint32_t result = 0;
634 bool overflow = false;
635 while (true) {
636 if (c0_ < '0' || c0_ > '9') {
637 break;
638 }
639 if (!overflow) {
640 if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
641 overflow = true;
642 } else {
643 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
644 }
645 }
646 Advance();
647 }
648 if (overflow) {
649 return INT32_MAX;
650 }
651 return result;
652 }
653
ParserIntervalQuantifier(int * pmin,int * pmax)654 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
655 {
656 // Quantifier::
657 // QuantifierPrefix
658 // QuantifierPrefix?
659 // QuantifierPrefix::
660 // *
661 // +
662 // ?
663 // {DecimalDigits}
664 // {DecimalDigits,}
665 // {DecimalDigits,DecimalDigits}
666 Advance();
667 *pmin = ParseDecimalDigits();
668 *pmax = *pmin;
669 switch (c0_) {
670 case ',': {
671 Advance();
672 if (c0_ == '}') {
673 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
674 PrintF("QuantifierPrefix{DecimalDigits,}\n");
675 *pmax = INT32_MAX;
676 Advance();
677 } else {
678 *pmax = ParseDecimalDigits();
679 if (c0_ == '}') {
680 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
681 PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
682 Advance();
683 } else {
684 return false;
685 }
686 }
687 break;
688 }
689 case '}':
690 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
691 PrintF("QuantifierPrefix{DecimalDigits}\n");
692 Advance();
693 break;
694 default:
695 Advance();
696 return false;
697 }
698 return true;
699 }
700
ParseQuantifier(size_t atomBcStart,int captureStart,int captureEnd)701 void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
702 {
703 int min = -1;
704 int max = -1;
705 bool isGreedy = true;
706 switch (c0_) {
707 case '*':
708 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
709 PrintF("QuantifierPrefix %c\n", c0_);
710 min = 0;
711 max = INT32_MAX;
712 Advance();
713 break;
714 case '+':
715 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
716 PrintF("QuantifierPrefix %c\n", c0_);
717 min = 1;
718 max = INT32_MAX;
719 Advance();
720 break;
721 case '?':
722 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
723 PrintF("QuantifierPrefix %c\n", c0_);
724 Advance();
725 min = 0;
726 max = 1;
727 break;
728 case '{': {
729 uint8_t *start = pc_ - 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
730 if (!ParserIntervalQuantifier(&min, &max)) {
731 pc_ = start;
732 Advance(); // back to '{'
733 return;
734 }
735 if (min > max) {
736 ParseError("Invalid repetition count");
737 return;
738 }
739 break;
740 }
741 default:
742 break;
743 }
744 if (c0_ == '?') {
745 isGreedy = false;
746 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
747 PrintF("Quantifier::QuantifierPrefix?\n");
748 Advance();
749 } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
750 ParseError("nothing to repeat");
751 return;
752 }
753 if (min != -1 && max != -1 && !isEmpty_) {
754 stackCount_++;
755 PushOpCode pushOp;
756 pushOp.InsertOpCode(&buffer_, atomBcStart);
757 atomBcStart += pushOp.GetSize();
758
759 if (captureStart != 0) {
760 SaveResetOpCode saveResetOp;
761 saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
762 }
763
764 // zero advance check
765 if (max == INT32_MAX) {
766 stackCount_++;
767 PushCharOpCode pushCharOp;
768 pushCharOp.InsertOpCode(&buffer_, atomBcStart);
769 CheckCharOpCode checkCharOp;
770 // NOLINTNEXTLINE(readability-magic-numbers)
771 checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
772 }
773
774 if (isGreedy) {
775 LoopGreedyOpCode loopOp;
776 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
777 } else {
778 LoopOpCode loopOp;
779 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
780 }
781
782 if (min == 0) {
783 if (isGreedy) {
784 SplitNextOpCode splitNextOp;
785 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
786 } else {
787 SplitFirstOpCode splitFirstOp;
788 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
789 }
790 }
791
792 PopOpCode popOp;
793 popOp.EmitOpCode(&buffer_);
794 }
795 isEmpty_ = false;
796 }
797
ParseGroupSpecifier(const uint8_t ** pp,CString & name)798 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, CString &name)
799 {
800 const uint8_t *p = *pp;
801 uint32_t c = 0;
802 char buffer[CACHE_SIZE] = {0};
803 char *q = buffer;
804 while (true) {
805 if (p <= end_) {
806 c = *p;
807 } else {
808 c = KEY_EOF;
809 }
810 if (c == '\\') {
811 p++;
812 if (*p != 'u') {
813 return false;
814 }
815 if (!ParseUnicodeEscape(&c)) {
816 return false;
817 }
818 } else if (c == '>') {
819 break;
820 } else if (c > CACHE_SIZE && c != KEY_EOF) {
821 c = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
822 } else if (c != KEY_EOF) {
823 p++;
824 } else {
825 return false;
826 }
827 if (q == buffer) {
828 if (!IsIdentFirst(c)) {
829 return false;
830 }
831 } else {
832 if (!u_isIDPart(c)) {
833 return false;
834 }
835 }
836 if (q != nullptr) {
837 *q++ = c;
838 }
839 } // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
840 p++;
841 *pp = p;
842 name = buffer;
843 return true;
844 }
845
ParseCaptureCount(const char * groupName)846 int RegExpParser::ParseCaptureCount(const char *groupName)
847 {
848 const uint8_t *p = nullptr;
849 int captureIndex = 1;
850 CString name;
851 hasNamedCaptures_ = 0;
852 for (p = base_; p < end_; p++) { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
853 switch (*p) {
854 case '(': {
855 if (p[1] == '?') { // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
856 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
857 if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
858 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
859 p[CAPTURE_CONUT_ADVANCE] != '=') {
860 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
861 hasNamedCaptures_ = 1;
862 p += CAPTURE_CONUT_ADVANCE;
863 if (groupName != nullptr) {
864 if (ParseGroupSpecifier(&p, name)) {
865 if (strcmp(name.c_str(), groupName) == 0) {
866 return captureIndex;
867 }
868 }
869 }
870 captureIndex++;
871 }
872 } else {
873 captureIndex++;
874 }
875 break;
876 }
877 case '\\':
878 p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
879 break;
880 case '[': {
881 while (p < end_ && *p != ']') {
882 if (*p == '\\') {
883 p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
884 }
885 p++; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
886 }
887 break;
888 }
889 default:
890 break;
891 }
892 }
893 return captureIndex;
894 }
895
896 // NOLINTNEXTLINE(readability-function-size)
ParseAtomEscape(bool isBackward)897 int RegExpParser::ParseAtomEscape(bool isBackward)
898 {
899 // AtomEscape[U, N]::
900 // DecimalEscape
901 // CharacterClassEscape[?U]
902 // CharacterEscape[?U]
903 // [+N]kGroupName[?U]
904 int result = -1;
905 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
906 PrintF("Parse AtomEscape------\n");
907 PrevOpCode prevOp;
908 switch (c0_) {
909 case KEY_EOF:
910 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
911 ParseError("unexpected end");
912 break;
913 // DecimalEscape
914 case '1':
915 case '2':
916 case '3':
917 case '4':
918 case '5':
919 case '6':
920 case '7':
921 case '8':
922 case '9': {
923 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
924 PrintF("NonZeroDigit %c\n", c0_);
925 int capture = ParseDecimalDigits();
926 if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
927 ParseError("invalid backreference count");
928 break;
929 }
930 if (isBackward) {
931 BackwardBackReferenceOpCode backReferenceOp;
932 backReferenceOp.EmitOpCode(&buffer_, capture);
933 } else {
934 BackReferenceOpCode backReferenceOp;
935 backReferenceOp.EmitOpCode(&buffer_, capture);
936 }
937 break;
938 }
939 // CharacterClassEscape
940 case 'd': {
941 // [0-9]
942 RangeOpCode rangeOp;
943 if (isBackward) {
944 prevOp.EmitOpCode(&buffer_, 0);
945 }
946 rangeOp.InsertOpCode(&buffer_, g_rangeD);
947 goto parseLookBehind;
948 }
949 case 'D': {
950 // [^0-9]
951 RangeSet atomRange(g_rangeD);
952 atomRange.Invert(IsUtf16());
953 Range32OpCode rangeOp;
954 if (isBackward) {
955 prevOp.EmitOpCode(&buffer_, 0);
956 }
957 rangeOp.InsertOpCode(&buffer_, atomRange);
958 goto parseLookBehind;
959 }
960 case 's': {
961 // [\f\n\r\t\v]
962 RangeOpCode rangeOp;
963 if (isBackward) {
964 prevOp.EmitOpCode(&buffer_, 0);
965 }
966 rangeOp.InsertOpCode(&buffer_, g_rangeS);
967 goto parseLookBehind;
968 }
969 case 'S': {
970 RangeSet atomRange(g_rangeS);
971 Range32OpCode rangeOp;
972 atomRange.Invert(IsUtf16());
973 if (isBackward) {
974 prevOp.EmitOpCode(&buffer_, 0);
975 }
976 rangeOp.InsertOpCode(&buffer_, atomRange);
977 goto parseLookBehind;
978 }
979 case 'w': {
980 // [A-Za-z0-9]
981 RangeOpCode rangeOp;
982 if (isBackward) {
983 prevOp.EmitOpCode(&buffer_, 0);
984 }
985 rangeOp.InsertOpCode(&buffer_, g_rangeW);
986 goto parseLookBehind;
987 }
988 case 'W': {
989 // [^A-Za-z0-9]
990 RangeSet atomRange(g_rangeW);
991 atomRange.Invert(IsUtf16());
992 Range32OpCode rangeOp;
993 if (isBackward) {
994 prevOp.EmitOpCode(&buffer_, 0);
995 }
996 rangeOp.InsertOpCode(&buffer_, atomRange);
997 goto parseLookBehind;
998 }
999 // P{UnicodePropertyValueExpression}
1000 // p{UnicodePropertyValueExpression}
1001 case 'P':
1002 case 'p':
1003 // [+N]kGroupName[?U]
1004 case 'k': {
1005 Advance();
1006 if (c0_ != '<') {
1007 if (!IsUtf16() || HasNamedCaptures()) {
1008 ParseError("expecting group name.");
1009 break;
1010 }
1011 }
1012 Advance();
1013 Prev();
1014 CString name;
1015 auto **pp = const_cast<const uint8_t **>(&pc_);
1016 if (!ParseGroupSpecifier(pp, name)) {
1017 ParseError("GroupName Syntax error.");
1018 break;
1019 }
1020 int postion = FindGroupName(name);
1021 if (postion < 0) {
1022 postion = ParseCaptureCount(name.c_str());
1023 if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
1024 ParseError("group name not defined");
1025 break;
1026 }
1027 }
1028 if (isBackward) {
1029 BackwardBackReferenceOpCode backReferenceOp;
1030 backReferenceOp.EmitOpCode(&buffer_, postion);
1031 } else {
1032 BackReferenceOpCode backReferenceOp;
1033 backReferenceOp.EmitOpCode(&buffer_, postion);
1034 }
1035 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1036 Advance();
1037 break;
1038 }
1039 parseLookBehind: {
1040 if (isBackward) {
1041 prevOp.EmitOpCode(&buffer_, 0);
1042 }
1043 Advance();
1044 break;
1045 }
1046 default:
1047 result = ParseCharacterEscape();
1048 break;
1049 }
1050 return result;
1051 }
1052
RecountCaptures()1053 int RegExpParser::RecountCaptures()
1054 {
1055 if (totalCaptureCount_ < 0) {
1056 const char *name = reinterpret_cast<const char*>(groupNames_.buf_);
1057 totalCaptureCount_ = ParseCaptureCount(name);
1058 }
1059 return totalCaptureCount_;
1060 }
HasNamedCaptures()1061 bool RegExpParser::HasNamedCaptures()
1062 {
1063 if (hasNamedCaptures_ < 0) {
1064 RecountCaptures();
1065 }
1066 return false;
1067 }
1068
ParseCharacterEscape()1069 int RegExpParser::ParseCharacterEscape()
1070 {
1071 // CharacterEscape[U]::
1072 // ControlEscape
1073 // c ControlLetter
1074 // 0 [lookahead ∉ DecimalDigit]
1075 // HexEscapeSequence
1076 // RegExpUnicodeEscapeSequence[?U]
1077 // IdentityEscape[?U]
1078 uint32_t result = 0;
1079 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1080 switch (c0_) {
1081 // ControlEscape
1082 case 'f':
1083 result = '\f';
1084 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1085 PrintF("ControlEscape %c\n", c0_);
1086 Advance();
1087 break;
1088 case 'n':
1089 result = '\n';
1090 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1091 PrintF("ControlEscape %c\n", c0_);
1092 Advance();
1093 break;
1094 case 'r':
1095 result = '\r';
1096 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1097 PrintF("ControlEscape %c\n", c0_);
1098 Advance();
1099 break;
1100 case 't':
1101 result = '\t';
1102 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1103 PrintF("ControlEscape %c\n", c0_);
1104 Advance();
1105 break;
1106 case 'v':
1107 result = '\v';
1108 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1109 PrintF("ControlEscape %c\n", c0_);
1110 Advance();
1111 break;
1112 // c ControlLetter
1113 case 'c': {
1114 Advance();
1115 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1116 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1117 PrintF("ControlLetter %c\n", c0_);
1118 result = static_cast<uint32_t>(c0_) & 0x1f; // NOLINTNEXTLINE(readability-magic-numbers)
1119 Advance();
1120 } else {
1121 if (!IsUtf16()) {
1122 pc_--; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1123 result = '\\';
1124 } else {
1125 ParseError("Invalid control letter");
1126 return -1;
1127 }
1128 }
1129 break;
1130 }
1131 case '0': {
1132 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1133 PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
1134 if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) { // NOLINTNEXTLINE(readability-magic-numbers)
1135 Advance();
1136 result = 0;
1137 break;
1138 }
1139 [[fallthrough]];
1140 }
1141 case '1':
1142 case '2':
1143 case '3':
1144 case '4':
1145 case '5':
1146 case '6':
1147 case '7': {
1148 if (IsUtf16()) {
1149 // With /u, decimal escape is not interpreted as octal character code.
1150 ParseError("Invalid class escape");
1151 return 0;
1152 }
1153 result = ParseOctalLiteral();
1154 break;
1155 }
1156 // ParseHexEscapeSequence
1157 // ParseRegExpUnicodeEscapeSequence
1158 case 'x': {
1159 Advance();
1160 if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
1161 return result;
1162 }
1163 if (IsUtf16()) {
1164 ParseError("Invalid class escape");
1165 return -1;
1166 }
1167 result = 'x';
1168 break;
1169 }
1170 case 'u': {
1171 Advance();
1172 if (ParseUnicodeEscape(&result)) {
1173 return result;
1174 }
1175 if (IsUtf16()) {
1176 // With /u, invalid escapes are not treated as identity escapes.
1177 ParseError("Invalid unicode escape");
1178 return 0;
1179 }
1180 // If \u is not followed by a two-digit hexadecimal, treat it
1181 // as an identity escape.
1182 result = 'u';
1183 break;
1184 }
1185 // IdentityEscape[?U]
1186 case '$':
1187 case '(':
1188 case ')':
1189 case '*':
1190 case '+':
1191 case '.':
1192 case '/':
1193 case '?':
1194 case '[':
1195 case '\\':
1196 case ']':
1197 case '^':
1198 case '{':
1199 case '|':
1200 case '}':
1201 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1202 PrintF("IdentityEscape %c\n", c0_);
1203 result = c0_;
1204 Advance();
1205 break;
1206 default: {
1207 if (IsUtf16()) {
1208 ParseError("Invalid unicode escape");
1209 return 0;
1210 }
1211 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1212 PrintF("SourceCharacter %c\n", c0_);
1213 result = c0_;
1214 if (result < CHAR_MAXS) {
1215 Advance();
1216 } else {
1217 Prev();
1218 const uint8_t *p = pc_;
1219 result = static_cast<uint32_t>(base::StringHelper::UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
1220 int offset = static_cast<int>(p - pc_);
1221 Advance(offset + 1);
1222 }
1223 break;
1224 }
1225 }
1226 return static_cast<int>(result);
1227 }
1228
ParseClassRanges(RangeSet * result)1229 bool RegExpParser::ParseClassRanges(RangeSet *result)
1230 {
1231 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1232 PrintF("Parse ClassRanges------\n");
1233 while (c0_ != ']') {
1234 RangeSet s1;
1235 bool needInter = false;
1236 uint32_t c1 = ParseClassAtom(&s1);
1237 if (c1 == UINT32_MAX) {
1238 ParseError("invalid class range");
1239 return false;
1240 }
1241 needInter = NeedIntersection(c1);
1242 int next_c0 = *pc_;
1243 if (c0_ == '-' && next_c0 != ']') {
1244 if (c1 == CLASS_RANGE_BASE) {
1245 if (IsUtf16()) {
1246 ParseError("invalid class range");
1247 return false;
1248 }
1249 result->Insert(s1);
1250 continue;
1251 }
1252 Advance();
1253 RangeSet s2;
1254 uint32_t c2 = ParseClassAtom(&s2);
1255 if (c2 == UINT32_MAX) {
1256 ParseError("invalid class range");
1257 return false;
1258 }
1259 if (c2 == CLASS_RANGE_BASE) {
1260 if (IsUtf16()) {
1261 ParseError("invalid class range");
1262 return false;
1263 }
1264 result->Insert(s2);
1265 continue;
1266 }
1267 if (c1 < INT8_MAX) {
1268 if (c1 > c2) {
1269 ParseError("invalid class range");
1270 return false;
1271 }
1272 }
1273 needInter = NeedIntersection(c2);
1274 result->Insert(c1, c2);
1275 if (IsIgnoreCase() && needInter) {
1276 ProcessIntersection(result);
1277 }
1278 } else {
1279 result->Insert(s1);
1280 if (!(IsIgnoreCase() && needInter)) {
1281 continue;
1282 }
1283 if (c1 <= 'z' && c1 >= 'a') {
1284 result->Insert(RangeSet(c1 - 'a' + 'A'));
1285 } else {
1286 result->Insert(RangeSet(c1 - 'A' + 'a'));
1287 }
1288 }
1289 }
1290 Advance();
1291 return true;
1292 }
1293
ParseClassAtom(RangeSet * atom)1294 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1295 {
1296 uint32_t ret = UINT32_MAX;
1297 switch (c0_) {
1298 case '\\': {
1299 Advance();
1300 ret = static_cast<uint32_t>(ParseClassEscape(atom));
1301 break;
1302 }
1303 case KEY_EOF:
1304 break;
1305 case 0: {
1306 if (pc_ >= end_) {
1307 return UINT32_MAX;
1308 }
1309 [[fallthrough]];
1310 }
1311 default: {
1312 uint32_t value = c0_;
1313 size_t u16_size = 0;
1314 if (c0_ > INT8_MAX) { // NOLINTNEXTLINE(readability-magic-numbers)
1315 pc_ -= 1; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1316 auto u16_result = base::utf_helper::ConvertUtf8ToUtf16Pair(pc_, true);
1317 value = u16_result.first;
1318 u16_size = u16_result.second;
1319 Advance(u16_size + 1);
1320 } else {
1321 Advance();
1322 }
1323 atom->Insert(RangeSet(value));
1324 ret = value;
1325 break;
1326 }
1327 }
1328 return ret;
1329 }
1330
ParseClassEscape(RangeSet * atom)1331 int RegExpParser::ParseClassEscape(RangeSet *atom)
1332 {
1333 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1334 PrintF("Parse ClassEscape------\n");
1335 int result = -1;
1336 switch (c0_) {
1337 case 'b':
1338 Advance();
1339 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1340 PrintF("ClassEscape %c", 'b');
1341 result = '\b';
1342 atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1343 break;
1344 case '-':
1345 Advance();
1346 result = '-';
1347 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1348 PrintF("ClassEscape %c", '-');
1349 atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1350 break;
1351 // CharacterClassEscape
1352 case 'd':
1353 case 'D':
1354 result = CLASS_RANGE_BASE;
1355 atom->Insert(g_rangeD);
1356 if (c0_ == 'D') {
1357 atom->Invert(IsUtf16());
1358 }
1359 Advance();
1360 break;
1361 case 's':
1362 case 'S':
1363 result = CLASS_RANGE_BASE;
1364 atom->Insert(g_rangeS);
1365 if (c0_ == 'S') {
1366 atom->Invert(IsUtf16());
1367 }
1368 Advance();
1369 break;
1370 case 'w':
1371 case 'W':
1372 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1373 PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1374 result = CLASS_RANGE_BASE;
1375 atom->Insert(g_rangeW);
1376 if (c0_ == 'W') {
1377 atom->Invert(IsUtf16());
1378 }
1379 Advance();
1380 break;
1381 // P{UnicodePropertyValueExpression}
1382 // p{UnicodePropertyValueExpression}
1383 case 'P':
1384 case 'p':
1385 PrintF("Warning: \\p is not supported in ECMA 2015!");
1386 Advance();
1387 if (c0_ == '{') {
1388 Advance();
1389 if (c0_ == '}') {
1390 break; // p{}, invalid
1391 }
1392 bool isValue = false;
1393 ParseUnicodePropertyValueCharacters(&isValue);
1394 if (!isValue && c0_ == '=') {
1395 // UnicodePropertyName = UnicodePropertyValue
1396 Advance();
1397 if (c0_ == '}') {
1398 break; // p{xxx=}, invalid
1399 }
1400 ParseUnicodePropertyValueCharacters(&isValue);
1401 }
1402 if (c0_ != '}') {
1403 break; // p{xxx, invalid
1404 }
1405 // should do atom->Invert() here after ECMA 9.0
1406 Advance();
1407 result = CLASS_RANGE_BASE;
1408 }
1409 break;
1410 default:
1411 result = ParseCharacterEscape();
1412 int value = result;
1413 if (IsIgnoreCase()) {
1414 value = Canonicalize(value, IsUtf16());
1415 }
1416 atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1417 break;
1418 }
1419 return result;
1420 }
1421
ParseUnicodePropertyValueCharacters(bool * isValue)1422 void RegExpParser::ParseUnicodePropertyValueCharacters(bool *isValue)
1423 {
1424 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1425 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1426 PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1427 } else if (c0_ == '_') {
1428 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1429 PrintF("UnicodePropertyCharacter:: _ \n");
1430 } else if (c0_ >= '0' && c0_ <= '9') {
1431 *isValue = true;
1432 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1433 PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1434 } else {
1435 return;
1436 }
1437 Advance();
1438 ParseUnicodePropertyValueCharacters(isValue);
1439 }
1440
1441 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1442 void RegExpParser::PrintF(const char *fmt, ...)
1443 {
1444 #ifndef _NO_DEBUG_
1445 va_list args;
1446 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,)
1447 va_start(args, fmt);
1448 vprintf(fmt, args);
1449 va_end(args);
1450 #else
1451 (void)fmt;
1452 #endif
1453 }
1454
ParseError(const char * errorMessage)1455 void RegExpParser::ParseError(const char *errorMessage)
1456 {
1457 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1458 PrintF("error: ");
1459 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1460 PrintF(errorMessage);
1461 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1462 PrintF("\n");
1463 SetIsError();
1464 size_t length = strlen(errorMessage) + 1;
1465 if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1466 LOG_FULL(FATAL) << "memcpy_s failed";
1467 UNREACHABLE();
1468 }
1469 }
1470
IsIdentFirst(uint32_t c)1471 int RegExpParser::IsIdentFirst(uint32_t c)
1472 {
1473 if (c < CACHE_SIZE) {
1474 return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1; // 5: Shift five bits 31: and operation binary of 31
1475 } else {
1476 return static_cast<int>(u_isIDStart(c));
1477 }
1478 }
1479
Canonicalize(int c,bool isUnicode)1480 int RegExpParser::Canonicalize(int c, bool isUnicode)
1481 {
1482 if (c < TMP_BUF_SIZE) { // NOLINTNEXTLINE(readability-magic-numbers)
1483 if (c >= 'a' && c <= 'z') {
1484 c = c - 'a' + 'A';
1485 }
1486 } else {
1487 int cur = c;
1488 if (isUnicode) {
1489 c = u_tolower(static_cast<UChar32>(c));
1490 if (c >= 'a' && c <= 'z') {
1491 c = cur;
1492 }
1493 } else {
1494 c = u_toupper(static_cast<UChar32>(c));
1495 if (c >= 'A' && c <= 'Z') {
1496 c = cur;
1497 }
1498 }
1499 }
1500 return c;
1501 }
1502
NeedIntersection(uint32_t c)1503 bool RegExpParser::NeedIntersection(uint32_t c)
1504 {
1505 return (c <= 'z' && c >= 'a') || (c <= 'Z' && c >= 'A');
1506 }
1507 } // namespace panda::ecmascript
1508