1 /**
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "runtime/regexp/ecmascript/regexp_parser.h"
17 #include "runtime/regexp/ecmascript/regexp_opcode.h"
18 #include "runtime/include/coretypes/string-inl.h"
19
20 #include "libpandabase/utils/utils.h"
21 #include "libpandabase/utils/utf.h"
22
23 #include "securec.h"
24 #include "unicode/uchar.h"
25 #include "unicode/uniset.h"
26
// While NO_DEBUG is defined, the `#ifndef NO_DEBUG` section at the end of RegExpParser::Parse()
// (the RegExpOpCode::DumpRegExpOpCode call) is compiled out.
#define NO_DEBUG
28 namespace {
29 constexpr int INVALID_UNICODE_FROM_UTF8 = -1;
30
31 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
32 constexpr int UICODE_FROM_UTF8[] = {
33 0x80, 0xc0, 0xdf, 0xe0, 0xef, 0xf0, 0xf7, 0xf8, 0xfb, 0xfc, 0xfd,
34 };
35
36 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
37 constexpr int UTF8_MIN_CODE[] = {
38 0x80, 0x800, 0x10000, 0x00200000, 0x04000000,
39 };
40
41 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
42 constexpr char UTF8_FIRST_CODE[] = {
43 0x1f, 0xf, 0x7, 0x3, 0x1,
44 };
45
FromUtf8(int c,int l,const uint8_t * p,const uint8_t ** pp)46 int FromUtf8(int c, int l, const uint8_t *p, const uint8_t **pp)
47 {
48 uint32_t b;
49 // NOLINTNEXTLINE(hicpp-signed-bitwise)
50 c &= UTF8_FIRST_CODE[l - 1];
51 for (int i = 0; i < l; i++) {
52 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
53 b = *p++;
54 if (b < panda::utf::UTF8_2B_SECOND || b >= panda::utf::UTF8_2B_FIRST) {
55 return INVALID_UNICODE_FROM_UTF8;
56 }
57 // NOLINTNEXTLINE(hicpp-signed-bitwise)
58 c = (c << 6) | (b & panda::utf::UTF8_2B_THIRD); // 6: Maximum Unicode range
59 }
60 if (c < UTF8_MIN_CODE[l - 1]) {
61 return INVALID_UNICODE_FROM_UTF8;
62 }
63 *pp = p;
64 return c;
65 }
66
UnicodeFromUtf8(const uint8_t * p,int maxLen,const uint8_t ** pp)67 int UnicodeFromUtf8(const uint8_t *p, int maxLen, const uint8_t **pp)
68 {
69 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
70 int c = *p++;
71 if (c < UICODE_FROM_UTF8[0]) {
72 *pp = p;
73 return c;
74 }
75 int l = 0;
76 if (c >= UICODE_FROM_UTF8[1U] && c <= UICODE_FROM_UTF8[2U]) { // 1 - 2: 0000 0080 - 0000 07FF
77 l = 1; // 1: 0000 0080 - 0000 07FF Unicode
78 } else if (c >= UICODE_FROM_UTF8[3U] && c <= UICODE_FROM_UTF8[4U]) { // 3 - 4: 0000 0800 - 0000 FFFF
79 l = 2; // 2: 0000 0800 - 0000 FFFF Unicode
80 } else if (c >= UICODE_FROM_UTF8[5U] && c <= UICODE_FROM_UTF8[6U]) { // 5 - 6: 0001 0000 - 0010 FFFF
81 l = 3; // 3: 0001 0000 - 0010 FFFF Unicode
82 } else if (c >= UICODE_FROM_UTF8[7U] && c <= UICODE_FROM_UTF8[8U]) { // 7 - 8: 0020 0000 - 03FF FFFF
83 l = 4; // 4: 0020 0000 - 03FF FFFF Unicode
84 // NOLINTNEXTLINE(readability-magic-numbers)
85 } else if (c == UICODE_FROM_UTF8[9U] || c == UICODE_FROM_UTF8[10U]) { // 9 - 10: 0400 0000 - 7FFF FFFF
86 l = 5; // 5: 0400 0000 - 7FFF FFFF Unicode
87 } else {
88 return INVALID_UNICODE_FROM_UTF8;
89 }
90 /* check that we have enough characters */
91 if (l > (maxLen - 1)) {
92 return INVALID_UNICODE_FROM_UTF8;
93 }
94 return FromUtf8(c, l, p, pp);
95 }
96 } // namespace
97
98 namespace panda {
// Capacity of the fixed name buffer in ParseGroupSpecifier, which also compares bytes against it to decide
// whether to decode them with UnicodeFromUtf8.
static constexpr uint32_t CACHE_SIZE = 128;
static constexpr uint32_t CHAR_MAXS = 128;
// Bitmap with one bit per code point 0..127 (four 32-bit words); the set bits are '$', 'A'-'Z', '_', 'a'-'z'.
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
    /* $ A-Z _ a-z */
    0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE};
// '0'-'9'; emitted for the \d escape and, inverted, for \D (see ParseAtomEscape).
static RangeSet g_gRangeD(0x30, 0x39);  // NOLINT(fuchsia-statically-constructed-objects, readability-magic-numbers)
// Code points emitted for the \s escape and, inverted, for \S (see ParseAtomEscape).
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_gRangeS({
    std::pair<uint32_t, uint32_t>(0x0009, 0x000D),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0020, 0x0020),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x1680, 0x1680),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x2000, 0x200A),  // NOLINT(readability-magic-numbers)
    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
    std::pair<uint32_t, uint32_t>(0x2028, 0x2029),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x202F, 0x202F),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x205F, 0x205F),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x3000, 0x3000),  // NOLINT(readability-magic-numbers)
    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
    std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF),  // NOLINT(readability-magic-numbers)
});

// '0'-'9', 'A'-'Z', '_', 'a'-'z'; emitted for the \w escape and, inverted, for \W (see ParseAtomEscape).
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_gRangeW({
    std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x005F, 0x005F),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
});

// '$', 'A'-'Z', 'a'-'z'.
// NOTE(review): unlike ID_START_TABLE_ASCII this set has no '_' (0x5F) - confirm that is intended.
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_gRegexpIdentifyStart({
    std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
});

// '$', '0'-'9', 'A'-'Z', 'a'-'z' (no '_' here either).
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_gRegexpIdentifyContinue({
    std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
});
145
Parse()146 void RegExpParser::Parse()
147 {
148 // dynbuffer head init [size,capture_count,statck_count,flags]
149 buffer_.EmitU32(0);
150 buffer_.EmitU32(0);
151 buffer_.EmitU32(0);
152 buffer_.EmitU32(0);
153 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
154 PrintF("Parse Pattern------\n");
155 // Pattern[U, N]::
156 // Disjunction[?U, ?N]
157 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
158 Advance();
159 SaveStartOpCode saveStartOp;
160 int captureIndex = captureCount_++;
161 saveStartOp.EmitOpCode(&buffer_, captureIndex);
162 ParseDisjunction(false);
163 if (c0_ != KEY_EOF) {
164 ParseError("extraneous characters at the end");
165 return;
166 }
167 SaveEndOpCode saveEndOp;
168 saveEndOp.EmitOpCode(&buffer_, captureIndex);
169 MatchEndOpCode matchEndOp;
170 matchEndOp.EmitOpCode(&buffer_, 0);
171 // dynbuffer head assignments
172 buffer_.PutU32(0, buffer_.size_);
173 buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
174 buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
175 buffer_.PutU32(FLAGS_OFFSET, flags_);
176 #ifndef NO_DEBUG
177 RegExpOpCode::DumpRegExpOpCode(std::cout, buffer_);
178 #endif
179 }
180
ParseDisjunction(bool isBackward)181 void RegExpParser::ParseDisjunction(bool isBackward)
182 {
183 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
184 PrintF("Parse Disjunction------\n");
185 size_t start = buffer_.size_;
186 ParseAlternative(isBackward);
187 if (isError_) {
188 return;
189 }
190 do {
191 if (c0_ == '|') {
192 SplitNextOpCode splitOp;
193 uint32_t len = buffer_.size_ - start;
194 GotoOpCode gotoOp;
195 splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
196 uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
197 Advance();
198 ParseAlternative(isBackward);
199 gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
200 }
201 } while (c0_ != KEY_EOF && c0_ != ')');
202 }
203
ParseOctalLiteral()204 uint32_t RegExpParser::ParseOctalLiteral()
205 {
206 // For compatibility with some other browsers (not all), we parse
207 // up to three octal digits with a value below 256.
208 // ES#prod-annexB-LegacyOctalEscapeSequence
209 uint32_t value = c0_ - '0';
210 Advance();
211 if (c0_ >= '0' && c0_ <= '7') {
212 value = value * OCTAL_VALUE + c0_ - '0';
213 Advance();
214 if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
215 value = value * OCTAL_VALUE + c0_ - '0';
216 Advance();
217 }
218 }
219 return value;
220 }
221
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)222 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
223 {
224 uint32_t x = 0;
225 int d = static_cast<int>(HexValue(c0_));
226 if (d < 0) {
227 return false;
228 }
229 while (d >= 0) {
230 if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
231 LOG(FATAL, COMMON) << "value overflow";
232 return false;
233 }
234 x = x * HEX_VALUE + static_cast<uint32_t>(d);
235 if (x > maxValue) {
236 return false;
237 }
238 Advance();
239 d = static_cast<int>(HexValue(c0_));
240 }
241 *value = x;
242 return true;
243 }
244
245 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uint32_t * value)246 bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
247 {
248 // Accept both \uxxxx and \u{xxxxxx} (if allowed).
249 // In the latter case, the number of hex digits between { } is arbitrary.
250 // \ and u have already been read.
251 if (c0_ == '{' && IsUtf16()) {
252 uint8_t *start = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
253 Advance();
254 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { // NOLINT(readability-magic-numbers)
255 if (c0_ == '}') {
256 Advance();
257 return true;
258 }
259 }
260 pc_ = start;
261 Advance();
262 return false;
263 }
264 // \u but no {, or \u{...} escapes not allowed.
265 bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
266 if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
267 // Attempt to read trail surrogate.
268 uint8_t *start = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
269 if (*pc_ == 'u') {
270 Advance(UNICODE_HEX_ADVANCE);
271 uint32_t trail;
272 if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
273 *value = U16_GET_SUPPLEMENTARY((*value), (trail)); // NOLINT(hicpp-signed-bitwise)
274 return true;
275 }
276 }
277 pc_ = start;
278 Advance();
279 }
280 return result;
281 }
282
ParseHexEscape(int length,uint32_t * value)283 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
284 {
285 uint8_t *start = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
286 uint32_t val = 0;
287 for (int i = 0; i < length; ++i) {
288 uint32_t c = c0_;
289 int d = static_cast<int>(HexValue(c));
290 if (d < 0) {
291 pc_ = start;
292 Advance();
293 return false;
294 }
295 val = val * HEX_VALUE + static_cast<uint32_t>(d);
296 Advance();
297 }
298 *value = val;
299 return true;
300 }
301
// Parses one Alternative: a sequence of terms up to the next '|', ')' or KEY_EOF, emitting bytecode for
// each term into buffer_.
//   isBackward - true while compiling the body of a look-behind assertion. In that mode a PrevOpCode is
//                emitted before and after each character-consuming atom, and after every term the newly
//                emitted code is moved in front of the code already emitted for this alternative.
// Returns early as soon as isError_ is set or a ParseError is raised here.
// NOLINTNEXTLINE(readability-function-size)
void RegExpParser::ParseAlternative(bool isBackward)
{
    // Where this alternative's bytecode begins; used for the look-behind reordering at the bottom.
    size_t start = buffer_.size_;
    while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
        if (isError_) {
            return;
        }
        // Where the current term's bytecode begins; ParseQuantifier inserts its loop prologue here.
        size_t atomBcStart = buffer_.GetSize();
        int captureIndex = 0;
        // Only atoms may be followed by a quantifier; '^', '$', \b and \B leave this false.
        bool isAtom = false;
        switch (c0_) {
            case '^': {
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("Assertion %c line start \n", c0_);
                LineStartOpCode lineStartOp;
                lineStartOp.EmitOpCode(&buffer_, 0);
                Advance();
                break;
            }
            case '$': {
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("Assertion %c line end \n", c0_);
                LineEndOpCode lineEndOp;
                lineEndOp.EmitOpCode(&buffer_, 0);
                Advance();
                break;
            }
            case '\\': {
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("Escape %c \n", c0_);
                Advance();
                switch (c0_) {
                    case 'b': {
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("Assertion %c \n", c0_);
                        WordBoundaryOpCode wordBoundaryOp;
                        wordBoundaryOp.EmitOpCode(&buffer_, 0);
                        Advance();
                        break;
                    }
                    case 'B': {
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("Assertion %c \n", c0_);
                        NotWordBoundaryOpCode notWordBoundaryOp;
                        notWordBoundaryOp.EmitOpCode(&buffer_, 0);
                        Advance();
                        break;
                    }
                    default: {
                        isAtom = true;
                        // -1 means ParseAtomEscape emitted its own bytecode (or failed) and there is no
                        // single character left to emit here.
                        int atomValue = ParseAtomEscape(isBackward);
                        if (atomValue != -1) {
                            if (IsIgnoreCase()) {
                                if (!IsUtf16()) {
                                    atomValue = Canonicalize(atomValue, false);
                                } else {
                                    // Emit a range holding every code point of the ICU case-insensitive
                                    // closure of the character instead of a single char opcode.
                                    icu::UnicodeSet set(atomValue, atomValue);
                                    set.closeOver(USET_CASE_INSENSITIVE);
                                    set.removeAllStrings();
                                    int32_t size = set.size();
                                    RangeOpCode rangeOp;
                                    RangeSet rangeResult;
                                    for (int32_t idx = 0; idx < size; idx++) {
                                        int32_t uc = set.charAt(idx);
                                        RangeSet curRange(uc);
                                        rangeResult.Insert(curRange);
                                    }
                                    rangeOp.InsertOpCode(&buffer_, rangeResult);
                                    break;
                                }
                            }
                            // Values that fit in 16 bits use CharOpCode, larger ones Char32OpCode.
                            if (atomValue <= UINT16_MAX) {
                                CharOpCode charOp;
                                charOp.EmitOpCode(&buffer_, atomValue);
                            } else {
                                Char32OpCode charOp;
                                charOp.EmitOpCode(&buffer_, atomValue);
                            }
                        }
                        break;
                    }
                }
                break;
            }
            case '(': {
                Advance();
                // Handles both assertions "(?...)" and capture groups; the result tells whether the
                // group may take a quantifier.
                isAtom = ParseAssertionCapture(&captureIndex, isBackward);
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                Advance();
                break;
            }
            case '.': {
                PrevOpCode prevOp;
                if (isBackward) {
                    prevOp.EmitOpCode(&buffer_, 0);
                }
                // With the dotAll flag '.' compiles to AllOpCode, otherwise to DotsOpCode.
                if (IsDotAll()) {
                    AllOpCode allOp;
                    allOp.EmitOpCode(&buffer_, 0);
                } else {
                    DotsOpCode dotsOp;
                    dotsOp.EmitOpCode(&buffer_, 0);
                }
                if (isBackward) {
                    prevOp.EmitOpCode(&buffer_, 0);
                }
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("Atom %c match any \n", c0_);
                isAtom = true;
                Advance();
                break;
            }
            case '[': {
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("Atom %c match range \n", c0_);
                isAtom = true;
                PrevOpCode prevOp;
                Advance();
                if (isBackward) {
                    prevOp.EmitOpCode(&buffer_, 0);
                }
                // A leading '^' negates the class.
                bool isInvert = false;
                if (c0_ == '^') {
                    isInvert = true;
                    Advance();
                }
                RangeSet rangeResult;
                if (!ParseClassRanges(&rangeResult)) {
                    break;
                }
                if (isInvert) {
                    rangeResult.Invert(IsUtf16());
                }
                // Pick the 16-bit or 32-bit range opcode from the largest value in the set.
                uint32_t highValue = rangeResult.HighestValue();
                if (highValue <= UINT16_MAX) {
                    RangeOpCode rangeOp;
                    rangeOp.InsertOpCode(&buffer_, rangeResult);
                } else {
                    Range32OpCode rangeOp;
                    rangeOp.InsertOpCode(&buffer_, rangeResult);
                }

                if (isBackward) {
                    prevOp.EmitOpCode(&buffer_, 0);
                }
                break;
            }
            case '*':
            case '+':
            case '?':
                // A quantifier with no preceding atom.
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                ParseError("nothing to repeat");
                return;
            case '{': {
                // A well-formed interval quantifier here has nothing to repeat. Otherwise rewind to
                // the '{' and fall through so it is handled like '}' and ']'.
                uint8_t *begin = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                int dummy;
                if (ParserIntervalQuantifier(&dummy, &dummy)) {
                    ParseError("nothing to repeat");
                    return;
                }
                pc_ = begin;
                Advance();
            }
                [[fallthrough]];
            case '}':
            case ']':
                // A stray bracket is an error when IsUtf16() is true and an ordinary character otherwise.
                if (IsUtf16()) {
                    ParseError("syntax error");
                    return;
                }
                [[fallthrough]];
            default: {
                // PatternCharacter
                // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                PrintF("PatternCharacter %c\n", c0_);
                isAtom = true;
                {
                    PrevOpCode prevOp;
                    if (isBackward) {
                        prevOp.EmitOpCode(&buffer_, 0);
                    }
                    uint32_t matchedChar = c0_;
                    // A value above INT8_MAX + 1 is not taken as is: step back one byte and let ICU's
                    // U8_NEXT decode the code point from the pattern bytes, then skip the bytes it used.
                    if (c0_ > (INT8_MAX + 1)) {
                        Prev();
                        int i = 0;
                        UChar32 c;
                        int32_t length = end_ - pc_ + 1;
                        // NOLINTNEXTLINE(hicpp-signed-bitwise)
                        U8_NEXT(pc_, i, length, c);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                        matchedChar = static_cast<uint32_t>(c);
                        pc_ += i;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
                    }
                    if (IsIgnoreCase()) {
                        matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
                    }
                    if (matchedChar > UINT16_MAX) {
                        Char32OpCode charOp;
                        charOp.EmitOpCode(&buffer_, matchedChar);
                    } else {
                        CharOpCode charOp;
                        charOp.EmitOpCode(&buffer_, matchedChar);
                    }
                    if (isBackward) {
                        prevOp.EmitOpCode(&buffer_, 0);
                    }
                }
                Advance();
                break;
            }
        }
        if (isAtom && !isError_) {
            ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
        }
        if (isBackward) {
            // Move the term just emitted (bytes [atomBcStart, end)) in front of everything emitted
            // earlier for this alternative, so the terms end up in reverse order:
            //   1. shift [start, end) right by termSize, which leaves a copy of the term at [end, ...)
            //   2. copy that term back to `start`.
            size_t end = buffer_.GetSize();
            size_t termSize = end - atomBcStart;
            size_t moveSize = end - start;
            buffer_.Expand(end + termSize);
            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
            if (memmove_s(buffer_.buf_ + start + termSize, moveSize, buffer_.buf_ + start, moveSize) != EOK) {
                LOG(FATAL, COMMON) << "memmove_s failed";
                UNREACHABLE();
            }
            // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
            if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
                LOG(FATAL, COMMON) << "memcpy_s failed";
                UNREACHABLE();
            }
        }
    }
}
534
FindGroupName(const PandaString & name)535 int RegExpParser::FindGroupName(const PandaString &name)
536 {
537 size_t len = 0;
538 size_t nameLen = name.size();
539 const char *p = reinterpret_cast<char *>(groupNames_.buf_);
540 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
541 const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
542 int captureIndex = 1;
543 while (p < bufEnd) {
544 len = strlen(p);
545 if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
546 return captureIndex;
547 }
548 p += len + 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
549 captureIndex++;
550 }
551 return -1;
552 }
553
// Parses what follows an opening '(' (already consumed by the caller):
//   (?=...)  (?!...)     look-ahead / negative look-ahead
//   (?<=...) (?<!...)    look-behind / negative look-behind (the body is compiled with isBackward = true)
//   (?<name>...)         named capture group
//   (?:...)              non-capturing group
//   (...)                capture group
//   captureIndex - receives the index allocated for a capture group; untouched for the other forms
//   isBackward   - direction in effect for the enclosing term
// Returns true when the construct may be followed by a quantifier (groups), false for assertions and
// after a ParseError. On a successful return c0_ is the closing ')', which the caller consumes.
// NOTE(review): the do/while repeats the whole body whenever c0_ is neither ')' nor KEY_EOF afterwards;
// that state looks reachable only after a nested parse error - confirm the re-entry is intended.
bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
{
    bool isAtom = false;
    do {
        if (c0_ == '?') {
            Advance();
            switch (c0_) {
                // (?=Disjunction[?U, ?N])
                case '=': {
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                    PrintF("Assertion(?= Disjunction)\n");
                    Advance();
                    uint32_t start = buffer_.size_;
                    ParseDisjunction(isBackward);
                    // The body ends with a Match opcode; the MatchAhead inserted in front of it
                    // carries the body length.
                    MatchOpCode matchOp;
                    matchOp.EmitOpCode(&buffer_, 0);
                    MatchAheadOpCode matchAheadOp;
                    uint32_t len = buffer_.size_ - start;
                    matchAheadOp.InsertOpCode(&buffer_, start, len);
                    break;
                }
                // (?!Disjunction[?U, ?N])
                case '!': {
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                    PrintF("Assertion(?! Disjunction)\n");
                    uint32_t start = buffer_.size_;
                    Advance();
                    ParseDisjunction(isBackward);
                    MatchOpCode matchOp;
                    matchOp.EmitOpCode(&buffer_, 0);
                    NegativeMatchAheadOpCode matchAheadOp;
                    uint32_t len = buffer_.size_ - start;
                    matchAheadOp.InsertOpCode(&buffer_, start, len);
                    break;
                }
                case '<': {
                    Advance();
                    // (?<=Disjunction[?U, ?N])
                    if (c0_ == '=') {
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("Assertion(?<= Disjunction)\n");
                        Advance();
                        uint32_t start = buffer_.size_;
                        ParseDisjunction(true);
                        MatchOpCode matchOp;
                        matchOp.EmitOpCode(&buffer_, 0);
                        MatchAheadOpCode matchAheadOp;
                        uint32_t len = buffer_.size_ - start;
                        matchAheadOp.InsertOpCode(&buffer_, start, len);
                        // (?<!Disjunction[?U, ?N])
                    } else if (c0_ == '!') {
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("Assertion(?<! Disjunction)\n");
                        Advance();
                        uint32_t start = buffer_.size_;
                        ParseDisjunction(true);
                        MatchOpCode matchOp;
                        matchOp.EmitOpCode(&buffer_, 0);
                        NegativeMatchAheadOpCode matchAheadOp;
                        uint32_t len = buffer_.size_ - start;
                        matchAheadOp.InsertOpCode(&buffer_, start, len);
                    } else {
                        // (?<name>...): step back so ParseGroupSpecifier reads the name directly from
                        // pc_, then record it and continue as an ordinary capture group.
                        Prev();
                        PandaString name;
                        auto **pp = const_cast<const uint8_t **>(&pc_);
                        if (!ParseGroupSpecifier(pp, name)) {
                            ParseError("GroupName Syntax error.");
                            return false;
                        }
                        if (FindGroupName(name) > 0) {
                            ParseError("Duplicate GroupName error.");
                            return false;
                        }
                        groupNames_.EmitStr(name.c_str());
                        newGroupNames_.push_back(name);
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("group name %s", name.c_str());
                        Advance();
                        goto parseCapture;  // NOLINT(cppcoreguidelines-avoid-goto)
                    }
                    break;
                }
                // (?:Disjunction[?U, ?N])
                case ':':
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                    PrintF("Atom(?<: Disjunction)\n");
                    isAtom = true;
                    Advance();
                    ParseDisjunction(isBackward);
                    break;
                default:
                    Advance();
                    ParseError("? Syntax error.");
                    return false;
            }
        } else {
            // Unnamed capture group: record an empty name so groupNames_ keeps one entry per group.
            groupNames_.EmitChar(0);
        parseCapture:
            isAtom = true;
            *captureIndex = captureCount_++;
            SaveEndOpCode saveEndOp;
            SaveStartOpCode saveStartOp;
            // In backward mode the end marker is emitted before the body and the start marker after it.
            if (isBackward) {
                saveEndOp.EmitOpCode(&buffer_, *captureIndex);
            } else {
                saveStartOp.EmitOpCode(&buffer_, *captureIndex);
            }
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("capture start %d \n", *captureIndex);
            ParseDisjunction(isBackward);
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("capture end %d \n", *captureIndex);
            if (isBackward) {
                saveStartOp.EmitOpCode(&buffer_, *captureIndex);
            } else {
                saveEndOp.EmitOpCode(&buffer_, *captureIndex);
            }
        }
    } while (c0_ != ')' && c0_ != KEY_EOF);
    // Reaching the end of the pattern without the closing parenthesis is an error.
    if (c0_ != ')') {
        ParseError("capture syntax error");
        return false;
    }
    return isAtom;
}
679
ParseDecimalDigits()680 int RegExpParser::ParseDecimalDigits()
681 {
682 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
683 PrintF("Parse DecimalDigits------\n");
684 uint32_t result = 0;
685 bool overflow = false;
686 while (true) {
687 if (c0_ < '0' || c0_ > '9') {
688 break;
689 }
690 if (!overflow) {
691 if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
692 overflow = true;
693 } else {
694 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
695 }
696 }
697 Advance();
698 }
699 if (overflow) {
700 return INT32_MAX;
701 }
702 return result;
703 }
704
ParserIntervalQuantifier(int * pmin,int * pmax)705 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
706 {
707 // Quantifier::
708 // QuantifierPrefix
709 // QuantifierPrefix?
710 // QuantifierPrefix::
711 // *
712 // +
713 // ?
714 // {DecimalDigits}
715 // {DecimalDigits,}
716 // {DecimalDigits,DecimalDigits}
717 Advance();
718 *pmin = ParseDecimalDigits();
719 *pmax = *pmin;
720 switch (c0_) {
721 case ',': {
722 Advance();
723 if (c0_ == '}') {
724 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
725 PrintF("QuantifierPrefix{DecimalDigits,}\n");
726 *pmax = INT32_MAX;
727 Advance();
728 } else {
729 *pmax = ParseDecimalDigits();
730 if (c0_ == '}') {
731 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
732 PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
733 Advance();
734 } else {
735 return false;
736 }
737 }
738 break;
739 }
740 case '}':
741 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
742 PrintF("QuantifierPrefix{DecimalDigits}\n");
743 Advance();
744 break;
745 default:
746 Advance();
747 return false;
748 }
749 return true;
750 }
751
// Parses an optional quantifier following an atom and, when one is present, wraps the atom's bytecode
// in a counted loop.
//   atomBcStart  - offset in buffer_ where the atom's bytecode begins
//   captureStart - index of the capture group opened by the atom, or 0 when the atom is not a capture
//                  group (the caller initialises it to 0)
//   captureEnd   - last capture index allocated so far; with captureStart it is passed to SaveResetOpCode
// Recognises '*', '+', '?', '{n}', '{n,}', '{n,m}', each optionally followed by '?' for the non-greedy
// form. A '{' that does not start a valid interval is left unconsumed for the caller.
void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
{
    // -1 in both means "no quantifier seen".
    int min = -1;
    int max = -1;
    bool isGreedy = true;
    switch (c0_) {
        case '*':
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("QuantifierPrefix %c\n", c0_);
            min = 0;
            max = INT32_MAX;
            Advance();
            break;
        case '+':
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("QuantifierPrefix %c\n", c0_);
            min = 1;
            max = INT32_MAX;
            Advance();
            break;
        case '?':
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("QuantifierPrefix %c\n", c0_);
            Advance();
            min = 0;
            max = 1;
            break;
        case '{': {
            uint8_t *start = pc_ - 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
            if (!ParserIntervalQuantifier(&min, &max)) {
                pc_ = start;
                Advance();  // back to '{'
                return;
            }
            if (min > max) {
                ParseError("Invalid repetition count");
                return;
            }
            break;
        }
        default:
            break;
    }
    // A trailing '?' selects the non-greedy variant; any other quantifier character at this point
    // has nothing to apply to. (The `c0_ == '?'` test in the else-if can never be true because the
    // first branch already handled it.)
    if (c0_ == '?') {
        isGreedy = false;
        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
        PrintF("Quantifier::QuantifierPrefix?\n");
        Advance();
    } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
        ParseError("nothing to repeat");
        return;
    }
    if (min != -1 && max != -1) {
        // One stack slot for the loop; a Push opcode is inserted in front of the atom.
        stackCount_++;
        PushOpCode pushOp;
        pushOp.InsertOpCode(&buffer_, atomBcStart);
        atomBcStart += pushOp.GetSize();

        // For a capture-group atom, insert a SaveReset covering captureStart..captureEnd.
        if (captureStart != 0) {
            SaveResetOpCode saveResetOp;
            saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
        }

        // zero advance check
        // Only emitted for unbounded loops: PushChar before the atom and CheckChar after it,
        // using one more stack slot.
        if (max == INT32_MAX) {
            stackCount_++;
            PushCharOpCode pushCharOp;
            pushCharOp.InsertOpCode(&buffer_, atomBcStart);
            CheckCharOpCode checkCharOp;
            // NOLINTNEXTLINE(readability-magic-numbers)
            checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
        }

        // The loop operand is the distance from the end of the loop opcode back to atomBcStart;
        // the unsigned subtraction is expected to wrap for this backward offset.
        if (isGreedy) {
            LoopGreedyOpCode loopOp;
            loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
        } else {
            LoopOpCode loopOp;
            loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
        }

        // With a minimum of zero, a split inserted before the loop body allows skipping it entirely.
        if (min == 0) {
            if (isGreedy) {
                SplitNextOpCode splitNextOp;
                splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
            } else {
                SplitFirstOpCode splitFirstOp;
                splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
            }
        }

        PopOpCode popOp;
        popOp.EmitOpCode(&buffer_);
    }
}
847
ParseGroupSpecifier(const uint8_t ** pp,PandaString & name)848 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, PandaString &name)
849 {
850 const uint8_t *p = *pp;
851 uint32_t c;
852 std::array<char, CACHE_SIZE> buffer {};
853 char *q = buffer.data();
854 while (true) {
855 if (p <= end_) {
856 c = *p;
857 } else {
858 c = KEY_EOF;
859 }
860 if (c == '\\') {
861 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
862 p++;
863 if (*p != 'u') {
864 return false;
865 }
866 if (!ParseUnicodeEscape(&c)) {
867 return false;
868 }
869 } else if (c == '>') {
870 break;
871 } else if (c > CACHE_SIZE && c != KEY_EOF) {
872 c = static_cast<uint32_t>(UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
873 } else if (c != KEY_EOF) {
874 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
875 p++;
876 } else {
877 return false;
878 }
879 if (q == buffer.data()) {
880 if (IsIdentFirst(c) != 0) {
881 return false;
882 }
883 } else {
884 if (!u_isIDPart(c)) {
885 return false;
886 }
887 }
888 if (q != nullptr) {
889 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
890 *q++ = c;
891 }
892 }
893 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
894 p++;
895 *pp = p;
896 name = buffer.data();
897 return true;
898 }
899
ParseCaptureCount(const char * groupName)900 int RegExpParser::ParseCaptureCount(const char *groupName)
901 {
902 const uint8_t *p = nullptr;
903 int captureIndex = 1;
904 PandaString name;
905 hasNamedCaptures_ = 0;
906 for (p = base_; p < end_; p++) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
907 switch (*p) {
908 case '(': {
909 if (p[1] == '?') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
910 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
911 if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
912 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
913 p[CAPTURE_CONUT_ADVANCE] != '=') {
914 hasNamedCaptures_ = 1;
915 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
916 p += CAPTURE_CONUT_ADVANCE;
917 if (groupName != nullptr) {
918 if (ParseGroupSpecifier(&p, name)) {
919 if (strcmp(name.c_str(), groupName) == 0) {
920 return captureIndex;
921 }
922 }
923 }
924 captureIndex++;
925 }
926 } else {
927 captureIndex++;
928 }
929 break;
930 }
931 case '\\':
932 p++; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
933 break;
934 case '[': {
935 while (p < end_ && *p != ']') {
936 if (*p == '\\') {
937 p++; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
938 }
939 p++; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
940 }
941 break;
942 }
943 default:
944 break;
945 }
946 }
947 return captureIndex;
948 }
949
// Parses the part of an escape that follows the backslash (c0_ is the character after '\').
//   isBackward - true inside a look-behind body; selects the backward back-reference opcode and wraps
//                the class escapes in PrevOpCode
// Back-references and the class escapes \d \D \s \S \w \W emit their bytecode here and return -1.
// Every other escape is delegated to ParseCharacterEscape, whose result is returned unchanged.
// -1 is also returned after a ParseError.
// NOLINTNEXTLINE(readability-function-size)
int RegExpParser::ParseAtomEscape(bool isBackward)
{
    // AtomEscape[U, N]::
    //     DecimalEscape
    //     CharacterClassEscape[?U]
    //     CharacterEscape[?U]
    //     [+N]kGroupName[?U]
    int result = -1;
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
    PrintF("Parse AtomEscape------\n");
    PrevOpCode prevOp;
    switch (c0_) {
        case KEY_EOF:
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            ParseError("unexpected end");
            break;
        // DecimalEscape
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9': {
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("NonZeroDigit %c\n", c0_);
            int capture = ParseDecimalDigits();
            // The reference may point at a group that has not been parsed yet, so it is rejected only
            // when it exceeds both the groups seen so far and the count from a scan of the whole pattern.
            if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
                ParseError("invalid backreference count");
                break;
            }
            if (isBackward) {
                BackwardBackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, capture);
            } else {
                BackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, capture);
            }
            break;
        }
        // CharacterClassEscape
        case 'd': {
            // [0-9]
            RangeOpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, g_gRangeD);
            goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
            break;
        }
        case 'D': {
            // [^0-9]
            RangeSet atomRange(g_gRangeD);
            atomRange.Invert(IsUtf16());
            Range32OpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, atomRange);
            goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
            break;
        }
        case 's': {
            // \s: the code points listed in g_gRangeS
            RangeOpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, g_gRangeS);
            goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
            break;
        }
        case 'S': {
            // \S: everything not in g_gRangeS
            RangeSet atomRange(g_gRangeS);
            Range32OpCode rangeOp;
            atomRange.Invert(IsUtf16());
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, atomRange);
            goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
            break;
        }
        case 'w': {
            // [A-Za-z0-9_]
            RangeOpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, g_gRangeW);
            goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
            break;
        }
        case 'W': {
            // [^A-Za-z0-9_]
            RangeSet atomRange(g_gRangeW);
            atomRange.Invert(IsUtf16());
            Range32OpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, atomRange);
            goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
            break;
        }
        // P{UnicodePropertyValueExpression}
        // p{UnicodePropertyValueExpression}
        // NOTE(review): 'P' and 'p' share the \k code below, i.e. they are parsed as a named
        // back-reference - confirm this is intended.
        case 'P':
        case 'p':
        // [+N]kGroupName[?U]
        case 'k': {
            Advance();
            if (c0_ != '<') {
                if (!IsUtf16() || HasNamedCaptures()) {
                    ParseError("expecting group name.");
                    break;
                }
            }
            // Advance followed by Prev: ParseGroupSpecifier below reads the name straight from pc_.
            Advance();
            Prev();
            PandaString name;
            auto **pp = const_cast<const uint8_t **>(&pc_);
            if (!ParseGroupSpecifier(pp, name)) {
                ParseError("GroupName Syntax error.");
                break;
            }
            // Look among the groups parsed so far first, then scan the pattern for a later definition.
            int postion = FindGroupName(name);
            if (postion < 0) {
                postion = ParseCaptureCount(name.c_str());
                if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
                    ParseError("group name not defined");
                    break;
                }
            }
            if (isBackward) {
                BackwardBackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, postion);
            } else {
                BackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, postion);
            }
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            Advance();
            break;
        }
        // Shared tail of the class escapes: in backward mode emit the second PrevOpCode, then consume
        // the escape letter.
        parseLookBehind : {
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            Advance();
            break;
        }
        default:
            result = ParseCharacterEscape();
            break;
    }
    return result;
}
1112
RecountCaptures()1113 int RegExpParser::RecountCaptures()
1114 {
1115 if (totalCaptureCount_ < 0) {
1116 const char *name = reinterpret_cast<const char *>(groupNames_.buf_);
1117 totalCaptureCount_ = ParseCaptureCount(name);
1118 }
1119 return totalCaptureCount_;
1120 }
HasNamedCaptures()1121 bool RegExpParser::HasNamedCaptures()
1122 {
1123 if (hasNamedCaptures_ < 0) {
1124 RecountCaptures();
1125 }
1126 return false;
1127 }
1128
ParseCharacterEscape()1129 int RegExpParser::ParseCharacterEscape()
1130 {
1131 // CharacterEscape[U]::
1132 // ControlEscape
1133 // c ControlLetter
1134 // 0 [lookahead ∉ DecimalDigit]
1135 // HexEscapeSequence
1136 // RegExpUnicodeEscapeSequence[?U]
1137 // IdentityEscape[?U]
1138 uint32_t result = 0;
1139 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1140 switch (c0_) {
1141 // ControlEscape
1142 case 'f':
1143 result = '\f';
1144 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1145 PrintF("ControlEscape %c\n", c0_);
1146 Advance();
1147 break;
1148 case 'n':
1149 result = '\n';
1150 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1151 PrintF("ControlEscape %c\n", c0_);
1152 Advance();
1153 break;
1154 case 'r':
1155 result = '\r';
1156 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1157 PrintF("ControlEscape %c\n", c0_);
1158 Advance();
1159 break;
1160 case 't':
1161 result = '\t';
1162 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1163 PrintF("ControlEscape %c\n", c0_);
1164 Advance();
1165 break;
1166 case 'v':
1167 result = '\v';
1168 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1169 PrintF("ControlEscape %c\n", c0_);
1170 Advance();
1171 break;
1172 // c ControlLetter
1173 case 'c': {
1174 Advance();
1175 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1176 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1177 PrintF("ControlLetter %c\n", c0_);
1178 result = static_cast<uint32_t>(c0_) & 0x1f; // NOLINT(readability-magic-numbers, hicpp-signed-bitwise)
1179 Advance();
1180 } else {
1181 if (!IsUtf16()) {
1182 pc_--; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1183 result = '\\';
1184 } else {
1185 ParseError("Invalid control letter");
1186 return -1;
1187 }
1188 }
1189 break;
1190 }
1191 case '0': {
1192 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1193 PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
1194 if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) { // NOLINT(readability-magic-numbers)
1195 Advance();
1196 result = 0;
1197 break;
1198 }
1199 [[fallthrough]];
1200 }
1201 case '1':
1202 case '2':
1203 case '3':
1204 case '4':
1205 case '5':
1206 case '6':
1207 case '7': {
1208 if (IsUtf16()) {
1209 // With /u, decimal escape is not interpreted as octal character code.
1210 ParseError("Invalid class escape");
1211 return 0;
1212 }
1213 result = ParseOctalLiteral();
1214 break;
1215 }
1216 // ParseHexEscapeSequence
1217 // ParseRegExpUnicodeEscapeSequence
1218 case 'x': {
1219 Advance();
1220 if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
1221 return result;
1222 }
1223 if (IsUtf16()) {
1224 ParseError("Invalid class escape");
1225 return -1;
1226 }
1227 result = 'x';
1228 break;
1229 }
1230 case 'u': {
1231 Advance();
1232 if (ParseUnicodeEscape(&result)) {
1233 return result;
1234 }
1235 if (IsUtf16()) {
1236 // With /u, invalid escapes are not treated as identity escapes.
1237 ParseError("Invalid unicode escape");
1238 return 0;
1239 }
1240 // If \u is not followed by a two-digit hexadecimal, treat it
1241 // as an identity escape.
1242 result = 'u';
1243 break;
1244 }
1245 // IdentityEscape[?U]
1246 case '$':
1247 case '(':
1248 case ')':
1249 case '*':
1250 case '+':
1251 case '.':
1252 case '/':
1253 case '?':
1254 case '[':
1255 case '\\':
1256 case ']':
1257 case '^':
1258 case '{':
1259 case '|':
1260 case '}':
1261 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1262 PrintF("IdentityEscape %c\n", c0_);
1263 result = c0_;
1264 Advance();
1265 break;
1266 default: {
1267 if (IsUtf16()) {
1268 ParseError("Invalid unicode escape");
1269 return 0;
1270 }
1271 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1272 PrintF("SourceCharacter %c\n", c0_);
1273 result = c0_;
1274 if (result < CHAR_MAXS) {
1275 Advance();
1276 }
1277 break;
1278 }
1279 }
1280 return result;
1281 }
1282
ParseClassRanges(RangeSet * result)1283 bool RegExpParser::ParseClassRanges(RangeSet *result)
1284 {
1285 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1286 PrintF("Parse ClassRanges------\n");
1287 while (c0_ != ']') {
1288 RangeSet s1;
1289 uint32_t c1 = ParseClassAtom(&s1);
1290 if (c1 == UINT32_MAX) {
1291 ParseError("invalid class range");
1292 return false;
1293 }
1294
1295 int nextC0 = *pc_;
1296 if (c0_ == '-' && nextC0 != ']') {
1297 if (c1 == CLASS_RANGE_BASE) {
1298 if (IsUtf16()) {
1299 ParseError("invalid class range");
1300 return false;
1301 }
1302 result->Insert(s1);
1303 continue;
1304 }
1305 Advance();
1306 RangeSet s2;
1307 uint32_t c2 = ParseClassAtom(&s2);
1308 if (c2 == UINT32_MAX) {
1309 ParseError("invalid class range");
1310 return false;
1311 }
1312 if (c2 == CLASS_RANGE_BASE) {
1313 if (IsUtf16()) {
1314 ParseError("invalid class range");
1315 return false;
1316 }
1317 result->Insert(s2);
1318 continue;
1319 }
1320 if (c1 < INT8_MAX) {
1321 if (c1 > c2) {
1322 ParseError("invalid class range");
1323 return false;
1324 }
1325 }
1326 if (IsIgnoreCase()) {
1327 c1 = static_cast<uint32_t>(Canonicalize(c1, IsUtf16()));
1328 c2 = static_cast<uint32_t>(Canonicalize(c2, IsUtf16()));
1329 }
1330
1331 result->Insert(c1, c2);
1332 } else {
1333 result->Insert(s1);
1334 }
1335 }
1336 Advance();
1337 return true;
1338 }
1339
ParseClassAtom(RangeSet * atom)1340 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1341 {
1342 uint32_t ret = UINT32_MAX;
1343 switch (c0_) {
1344 case '\\': {
1345 Advance();
1346 ret = static_cast<uint32_t>(ParseClassEscape(atom));
1347 break;
1348 }
1349 case KEY_EOF:
1350 break;
1351 case 0: {
1352 if (pc_ >= end_) {
1353 return UINT32_MAX;
1354 }
1355 [[fallthrough]];
1356 }
1357 default: {
1358 uint32_t value = c0_;
1359 size_t u16Size = 0;
1360 if (c0_ > INT8_MAX) {
1361 pc_ -= 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1362 auto u16Result = utf::ConvertUtf8ToUtf16Pair(pc_, true);
1363 value = u16Result.first;
1364 u16Size = u16Result.second;
1365 Advance(u16Size + 1);
1366 } else {
1367 Advance();
1368 }
1369 if (IsIgnoreCase()) {
1370 value = static_cast<uint32_t>(Canonicalize(value, IsUtf16()));
1371 }
1372 atom->Insert(RangeSet(value));
1373 ret = value;
1374 break;
1375 }
1376 }
1377 return ret;
1378 }
1379
ParseClassEscape(RangeSet * atom)1380 int RegExpParser::ParseClassEscape(RangeSet *atom)
1381 {
1382 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1383 PrintF("Parse ClassEscape------\n");
1384 int result = -1;
1385 switch (c0_) {
1386 case 'b':
1387 Advance();
1388 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1389 PrintF("ClassEscape %c", 'b');
1390 result = '\b';
1391 atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1392 break;
1393 case '-':
1394 Advance();
1395 result = '-';
1396 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1397 PrintF("ClassEscape %c", '-');
1398 atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1399 break;
1400 // CharacterClassEscape
1401 case 'd':
1402 case 'D':
1403 result = CLASS_RANGE_BASE;
1404 atom->Insert(g_gRangeD);
1405 if (c0_ == 'D') {
1406 atom->Invert(IsUtf16());
1407 }
1408 Advance();
1409 break;
1410 case 's':
1411 case 'S':
1412 result = CLASS_RANGE_BASE;
1413 atom->Insert(g_gRangeS);
1414 if (c0_ == 'S') {
1415 atom->Invert(IsUtf16());
1416 }
1417 Advance();
1418 break;
1419 case 'w':
1420 case 'W':
1421 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1422 PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1423 result = CLASS_RANGE_BASE;
1424 atom->Insert(g_gRangeW);
1425 if (c0_ == 'W') {
1426 atom->Invert(IsUtf16());
1427 }
1428 Advance();
1429 break;
1430 // P{UnicodePropertyValueExpression}
1431 // p{UnicodePropertyValueExpression}
1432 case 'P':
1433 case 'p':
1434 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1435 PrintF("Warning: \\p is not supported in ECMA 2015!");
1436 Advance();
1437 if (c0_ == '{') {
1438 Advance();
1439 if (c0_ == '}') {
1440 break; // p{}, invalid
1441 }
1442 bool isValue = false;
1443 ParseUnicodePropertyValueCharacters(&isValue);
1444 if (!isValue && c0_ == '=') {
1445 // UnicodePropertyName = UnicodePropertyValue
1446 Advance();
1447 if (c0_ == '}') {
1448 break; // p{xxx=}, invalid
1449 }
1450 ParseUnicodePropertyValueCharacters(&isValue);
1451 }
1452 if (c0_ != '}') {
1453 break; // p{xxx, invalid
1454 }
1455 // should do atom->Invert() here after ECMA 9.0
1456 Advance();
1457 result = CLASS_RANGE_BASE;
1458 }
1459 break;
1460 default:
1461 result = ParseCharacterEscape();
1462 int value = result;
1463 if (IsIgnoreCase()) {
1464 value = Canonicalize(value, IsUtf16());
1465 }
1466 atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1467 break;
1468 }
1469 return result;
1470 }
1471
ParseUnicodePropertyValueCharacters(bool * isValue)1472 void RegExpParser::ParseUnicodePropertyValueCharacters(bool *isValue)
1473 {
1474 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1475 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1476 PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1477 } else if (c0_ == '_') {
1478 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1479 PrintF("UnicodePropertyCharacter:: _ \n");
1480 } else if (c0_ >= '0' && c0_ <= '9') {
1481 *isValue = true;
1482 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1483 PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1484 } else {
1485 return;
1486 }
1487 Advance();
1488 ParseUnicodePropertyValueCharacters(isValue);
1489 }
1490
1491 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1492 void RegExpParser::PrintF(const char *fmt, ...)
1493 {
1494 #ifndef NO_DEBUG
1495 va_list args;
1496 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg,)
1497 va_start(args, fmt);
1498 vprintf(fmt, args);
1499 va_end(args);
1500 #else
1501 (void)fmt;
1502 #endif
1503 }
1504
ParseError(const char * errorMessage)1505 void RegExpParser::ParseError(const char *errorMessage)
1506 {
1507 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1508 PrintF("error: ");
1509 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1510 PrintF(errorMessage);
1511 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1512 PrintF("\n");
1513 SetIsError();
1514 size_t length = strlen(errorMessage) + 1;
1515 if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1516 LOG(FATAL, COMMON) << "memcpy_s failed";
1517 UNREACHABLE();
1518 }
1519 }
1520
IsIdentFirst(uint32_t c)1521 int RegExpParser::IsIdentFirst(uint32_t c)
1522 {
1523 if (c < CACHE_SIZE) {
1524 // NOLINTNEXTLINE(hicpp-signed-bitwise
1525 return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1; // 5: Shift five bits 31: and operation binary of 31
1526 }
1527 return static_cast<int>(u_isIDStart(c));
1528 }
1529 } // namespace panda