1 /**
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "runtime/regexp/ecmascript/regexp_parser.h"
17 #include "runtime/regexp/ecmascript/regexp_opcode.h"
18 #include "runtime/include/coretypes/string-inl.h"
19
20 #include "libpandabase/utils/utils.h"
21 #include "libpandabase/utils/utf.h"
22
23 #include "securec.h"
24 #include "unicode/uchar.h"
25 #include "unicode/uniset.h"
26
27 namespace {
// Sentinel returned by the UTF-8 helpers in this namespace when a sequence cannot be decoded.
constexpr int INVALID_UNICODE_FROM_UTF8 = -1;

// Lead-byte boundaries consulted by UnicodeFromUtf8():
//   [0]       bytes below this value are returned unchanged (single-byte characters)
//   [1]..[2]  lead byte followed by 1 further byte
//   [3]..[4]  lead byte followed by 2 further bytes
//   [5]..[6]  lead byte followed by 3 further bytes
//   [7]..[8]  lead byte followed by 4 further bytes
//   [9],[10]  lead byte followed by 5 further bytes
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
constexpr int UICODE_FROM_UTF8[] = {
    0x80, 0xc0, 0xdf, 0xe0, 0xef, 0xf0, 0xf7, 0xf8, 0xfb, 0xfc, 0xfd,
};

// Indexed by (number of following bytes - 1): FromUtf8() rejects any decoded value
// smaller than the entry, i.e. values that were encoded with more bytes than needed.
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
constexpr int UTF8_MIN_CODE[] = {
    0x80, 0x800, 0x10000, 0x00200000, 0x04000000,
};

// Indexed by (number of following bytes - 1): mask FromUtf8() applies to the lead byte
// to keep only its payload bits.
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
constexpr char UTF8_FIRST_CODE[] = {
    0x1f, 0xf, 0x7, 0x3, 0x1,
};
44
FromUtf8(int c,int l,const uint8_t * p,const uint8_t ** pp)45 int FromUtf8(int c, int l, const uint8_t *p, const uint8_t **pp)
46 {
47 uint32_t b;
48 // NOLINTNEXTLINE(hicpp-signed-bitwise)
49 c &= UTF8_FIRST_CODE[l - 1];
50 for (int i = 0; i < l; i++) {
51 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
52 b = *p++;
53 if (b < ark::utf::UTF8_2B_SECOND || b >= ark::utf::UTF8_2B_FIRST) {
54 return INVALID_UNICODE_FROM_UTF8;
55 }
56 // NOLINTNEXTLINE(hicpp-signed-bitwise)
57 c = (c << 6) | (b & ark::utf::UTF8_2B_THIRD); // 6: Maximum Unicode range
58 }
59 if (c < UTF8_MIN_CODE[l - 1]) {
60 return INVALID_UNICODE_FROM_UTF8;
61 }
62 *pp = p;
63 return c;
64 }
65
UnicodeFromUtf8(const uint8_t * p,int maxLen,const uint8_t ** pp)66 int UnicodeFromUtf8(const uint8_t *p, int maxLen, const uint8_t **pp)
67 {
68 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
69 int c = *p++;
70 if (c < UICODE_FROM_UTF8[0]) {
71 *pp = p;
72 return c;
73 }
74 int l = 0;
75 if (c >= UICODE_FROM_UTF8[1U] && c <= UICODE_FROM_UTF8[2U]) { // 1 - 2: 0000 0080 - 0000 07FF
76 l = 1; // 1: 0000 0080 - 0000 07FF Unicode
77 } else if (c >= UICODE_FROM_UTF8[3U] && c <= UICODE_FROM_UTF8[4U]) { // 3 - 4: 0000 0800 - 0000 FFFF
78 l = 2; // 2: 0000 0800 - 0000 FFFF Unicode
79 } else if (c >= UICODE_FROM_UTF8[5U] && c <= UICODE_FROM_UTF8[6U]) { // 5 - 6: 0001 0000 - 0010 FFFF
80 l = 3; // 3: 0001 0000 - 0010 FFFF Unicode
81 } else if (c >= UICODE_FROM_UTF8[7U] && c <= UICODE_FROM_UTF8[8U]) { // 7 - 8: 0020 0000 - 03FF FFFF
82 l = 4; // 4: 0020 0000 - 03FF FFFF Unicode
83 // NOLINTNEXTLINE(readability-magic-numbers)
84 } else if (c == UICODE_FROM_UTF8[9U] || c == UICODE_FROM_UTF8[10U]) { // 9 - 10: 0400 0000 - 7FFF FFFF
85 l = 5; // 5: 0400 0000 - 7FFF FFFF Unicode
86 } else {
87 return INVALID_UNICODE_FROM_UTF8;
88 }
89 /* check that we have enough characters */
90 if (l > (maxLen - 1)) {
91 return INVALID_UNICODE_FROM_UTF8;
92 }
93 return FromUtf8(c, l, p, pp);
94 }
95 } // namespace
96
97 namespace ark {
// Capacity of the stack buffer ParseGroupSpecifier() collects a group name into; the same
// value is also used there as the threshold above which a byte is decoded as multi-byte UTF-8.
static constexpr uint32_t CACHE_SIZE = 128;
// NOTE(review): not referenced in the visible part of this file.
static constexpr uint32_t CHAR_MAXS = 128;
// 128-bit bitmap (4 x 32 bits), one bit per ASCII code: bit 36 ('$'), bits 65-90 ('A'-'Z'),
// bit 95 ('_') and bits 97-122 ('a'-'z') are set.
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
    /* $ A-Z _ a-z */
    0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE};
// Range used for \d (and, inverted, for \D): the ASCII digits '0'-'9'.
static RangeSet g_gRangeD(0x30, 0x39);  // NOLINT(fuchsia-statically-constructed-objects, readability-magic-numbers)
// Range used for \s (and, inverted, for \S).
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_gRangeS({
    std::pair<uint32_t, uint32_t>(0x0009, 0x000D),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0020, 0x0020),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x1680, 0x1680),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x2000, 0x200A),  // NOLINT(readability-magic-numbers)
    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
    std::pair<uint32_t, uint32_t>(0x2028, 0x2029),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x202F, 0x202F),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x205F, 0x205F),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x3000, 0x3000),  // NOLINT(readability-magic-numbers)
    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
    std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF),  // NOLINT(readability-magic-numbers)
});

// Range used for \w (and, inverted, for \W): '0'-'9', 'A'-'Z', '_' and 'a'-'z'.
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_gRangeW({
    std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x005F, 0x005F),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
});

// '$', 'A'-'Z' and 'a'-'z'.
// NOTE(review): unlike ID_START_TABLE_ASCII this set has no '_' (0x5F) - confirm that is intended.
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_gRegexpIdentifyStart({
    std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
});

// '$', '0'-'9', 'A'-'Z' and 'a'-'z'.
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_gRegexpIdentifyContinue({
    std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
});
144
Parse()145 void RegExpParser::Parse()
146 {
147 // dynbuffer head init [size,capture_count,statck_count,flags]
148 buffer_.EmitU32(0);
149 buffer_.EmitU32(0);
150 buffer_.EmitU32(0);
151 buffer_.EmitU32(0);
152 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
153 PrintF("Parse Pattern------\n");
154 // Pattern[U, N]::
155 // Disjunction[?U, ?N]
156 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
157 Advance();
158 SaveStartOpCode saveStartOp;
159 int captureIndex = captureCount_++;
160 saveStartOp.EmitOpCode(&buffer_, captureIndex);
161 ParseDisjunction(false);
162 if (c0_ != KEY_EOF) {
163 ParseError("extraneous characters at the end");
164 return;
165 }
166 SaveEndOpCode saveEndOp;
167 saveEndOp.EmitOpCode(&buffer_, captureIndex);
168 MatchEndOpCode matchEndOp;
169 matchEndOp.EmitOpCode(&buffer_, 0);
170 // dynbuffer head assignments
171 buffer_.PutU32(0, buffer_.size_);
172 buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
173 buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
174 buffer_.PutU32(FLAGS_OFFSET, flags_);
175 }
176
ParseDisjunction(bool isBackward)177 void RegExpParser::ParseDisjunction(bool isBackward)
178 {
179 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
180 PrintF("Parse Disjunction------\n");
181 size_t start = buffer_.size_;
182 ParseAlternative(isBackward);
183 if (isError_) {
184 return;
185 }
186 do {
187 if (c0_ == '|') {
188 SplitNextOpCode splitOp;
189 uint32_t len = buffer_.size_ - start;
190 GotoOpCode gotoOp;
191 splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
192 uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
193 Advance();
194 ParseAlternative(isBackward);
195 gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
196 }
197 } while (c0_ != KEY_EOF && c0_ != ')');
198 }
199
ParseOctalLiteral()200 uint32_t RegExpParser::ParseOctalLiteral()
201 {
202 // For compatibility with some other browsers (not all), we parse
203 // up to three octal digits with a value below 256.
204 // ES#prod-annexB-LegacyOctalEscapeSequence
205 uint32_t value = c0_ - '0';
206 Advance();
207 if (c0_ >= '0' && c0_ <= '7') {
208 value = value * OCTAL_VALUE + c0_ - '0';
209 Advance();
210 if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
211 value = value * OCTAL_VALUE + c0_ - '0';
212 Advance();
213 }
214 }
215 return value;
216 }
217
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)218 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
219 {
220 uint32_t x = 0;
221 int d = static_cast<int>(HexValue(c0_));
222 if (d < 0) {
223 return false;
224 }
225 while (d >= 0) {
226 if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
227 LOG(FATAL, COMMON) << "value overflow";
228 return false;
229 }
230 x = x * HEX_VALUE + static_cast<uint32_t>(d);
231 if (x > maxValue) {
232 return false;
233 }
234 Advance();
235 d = static_cast<int>(HexValue(c0_));
236 }
237 *value = x;
238 return true;
239 }
240
241 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uint32_t * value)242 bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
243 {
244 // Accept both \uxxxx and \u{xxxxxx} (if allowed).
245 // In the latter case, the number of hex digits between { } is arbitrary.
246 // \ and u have already been read.
247 if (c0_ == '{' && IsUtf16()) {
248 uint8_t *start = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
249 Advance();
250 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { // NOLINT(readability-magic-numbers)
251 if (c0_ == '}') {
252 Advance();
253 return true;
254 }
255 }
256 pc_ = start;
257 Advance();
258 return false;
259 }
260 // \u but no {, or \u{...} escapes not allowed.
261 bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
262 if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
263 // Attempt to read trail surrogate.
264 uint8_t *start = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
265 if (*pc_ == 'u') {
266 Advance(UNICODE_HEX_ADVANCE);
267 uint32_t trail;
268 if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
269 *value = U16_GET_SUPPLEMENTARY((*value), (trail)); // NOLINT(hicpp-signed-bitwise)
270 return true;
271 }
272 }
273 pc_ = start;
274 Advance();
275 }
276 return result;
277 }
278
ParseHexEscape(int length,uint32_t * value)279 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
280 {
281 uint8_t *start = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
282 uint32_t val = 0;
283 for (int i = 0; i < length; ++i) {
284 uint32_t c = c0_;
285 int d = static_cast<int>(HexValue(c));
286 if (d < 0) {
287 pc_ = start;
288 Advance();
289 return false;
290 }
291 val = val * HEX_VALUE + static_cast<uint32_t>(d);
292 Advance();
293 }
294 *value = val;
295 return true;
296 }
297
298 // NOLINTNEXTLINE(readability-function-size)
ParseAlternative(bool isBackward)299 void RegExpParser::ParseAlternative(bool isBackward)
300 {
301 size_t start = buffer_.size_;
302 while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
303 if (isError_) {
304 return;
305 }
306 size_t atomBcStart = buffer_.GetSize();
307 int captureIndex = 0;
308 bool isAtom = false;
309 switch (c0_) {
310 case '^': {
311 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
312 PrintF("Assertion %c line start \n", c0_);
313 LineStartOpCode lineStartOp;
314 lineStartOp.EmitOpCode(&buffer_, 0);
315 Advance();
316 break;
317 }
318 case '$': {
319 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
320 PrintF("Assertion %c line end \n", c0_);
321 LineEndOpCode lineEndOp;
322 lineEndOp.EmitOpCode(&buffer_, 0);
323 Advance();
324 break;
325 }
326 case '\\': {
327 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
328 PrintF("Escape %c \n", c0_);
329 Advance();
330 switch (c0_) {
331 case 'b': {
332 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
333 PrintF("Assertion %c \n", c0_);
334 WordBoundaryOpCode wordBoundaryOp;
335 wordBoundaryOp.EmitOpCode(&buffer_, 0);
336 Advance();
337 break;
338 }
339 case 'B': {
340 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
341 PrintF("Assertion %c \n", c0_);
342 NotWordBoundaryOpCode notWordBoundaryOp;
343 notWordBoundaryOp.EmitOpCode(&buffer_, 0);
344 Advance();
345 break;
346 }
347 default: {
348 isAtom = true;
349 int atomValue = ParseAtomEscape(isBackward);
350 if (atomValue != -1) {
351 if (IsIgnoreCase()) {
352 if (!IsUtf16()) {
353 atomValue = Canonicalize(atomValue, false);
354 } else {
355 icu::UnicodeSet set(atomValue, atomValue);
356 set.closeOver(USET_CASE_INSENSITIVE);
357 set.removeAllStrings();
358 int32_t size = set.size();
359 RangeOpCode rangeOp;
360 RangeSet rangeResult;
361 for (int32_t idx = 0; idx < size; idx++) {
362 int32_t uc = set.charAt(idx);
363 RangeSet curRange(uc);
364 rangeResult.Insert(curRange);
365 }
366 rangeOp.InsertOpCode(&buffer_, rangeResult);
367 break;
368 }
369 }
370 if (atomValue <= UINT16_MAX) {
371 CharOpCode charOp;
372 charOp.EmitOpCode(&buffer_, atomValue);
373 } else {
374 Char32OpCode charOp;
375 charOp.EmitOpCode(&buffer_, atomValue);
376 }
377 }
378 break;
379 }
380 }
381 break;
382 }
383 case '(': {
384 Advance();
385 isAtom = ParseAssertionCapture(&captureIndex, isBackward);
386 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
387 Advance();
388 break;
389 }
390 case '.': {
391 PrevOpCode prevOp;
392 if (isBackward) {
393 prevOp.EmitOpCode(&buffer_, 0);
394 }
395 if (IsDotAll()) {
396 AllOpCode allOp;
397 allOp.EmitOpCode(&buffer_, 0);
398 } else {
399 DotsOpCode dotsOp;
400 dotsOp.EmitOpCode(&buffer_, 0);
401 }
402 if (isBackward) {
403 prevOp.EmitOpCode(&buffer_, 0);
404 }
405 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
406 PrintF("Atom %c match any \n", c0_);
407 isAtom = true;
408 Advance();
409 break;
410 }
411 case '[': {
412 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
413 PrintF("Atom %c match range \n", c0_);
414 isAtom = true;
415 PrevOpCode prevOp;
416 Advance();
417 if (isBackward) {
418 prevOp.EmitOpCode(&buffer_, 0);
419 }
420 bool isInvert = false;
421 if (c0_ == '^') {
422 isInvert = true;
423 Advance();
424 }
425 RangeSet rangeResult;
426 if (!ParseClassRanges(&rangeResult)) {
427 break;
428 }
429 if (isInvert) {
430 rangeResult.Invert(IsUtf16());
431 }
432 uint32_t highValue = rangeResult.HighestValue();
433 if (highValue <= UINT16_MAX) {
434 RangeOpCode rangeOp;
435 rangeOp.InsertOpCode(&buffer_, rangeResult);
436 } else {
437 Range32OpCode rangeOp;
438 rangeOp.InsertOpCode(&buffer_, rangeResult);
439 }
440
441 if (isBackward) {
442 prevOp.EmitOpCode(&buffer_, 0);
443 }
444 break;
445 }
446 case '*':
447 case '+':
448 case '?':
449 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
450 ParseError("nothing to repeat");
451 return;
452 case '{': {
453 uint8_t *begin = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
454 int dummy;
455 if (ParserIntervalQuantifier(&dummy, &dummy)) {
456 ParseError("nothing to repeat");
457 return;
458 }
459 pc_ = begin;
460 Advance();
461 }
462 [[fallthrough]];
463 case '}':
464 case ']':
465 if (IsUtf16()) {
466 ParseError("syntax error");
467 return;
468 }
469 [[fallthrough]];
470 default: {
471 // PatternCharacter
472 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
473 PrintF("PatternCharacter %c\n", c0_);
474 isAtom = true;
475 {
476 PrevOpCode prevOp;
477 if (isBackward) {
478 prevOp.EmitOpCode(&buffer_, 0);
479 }
480 uint32_t matchedChar = c0_;
481 if (c0_ > (INT8_MAX + 1)) {
482 Prev();
483 int i = 0;
484 UChar32 c;
485 int32_t length = end_ - pc_ + 1;
486 // NOLINTNEXTLINE(hicpp-signed-bitwise)
487 U8_NEXT(pc_, i, length, c); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
488 matchedChar = static_cast<uint32_t>(c);
489 pc_ += i; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
490 }
491 if (IsIgnoreCase()) {
492 matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
493 }
494 if (matchedChar > UINT16_MAX) {
495 Char32OpCode charOp;
496 charOp.EmitOpCode(&buffer_, matchedChar);
497 } else {
498 CharOpCode charOp;
499 charOp.EmitOpCode(&buffer_, matchedChar);
500 }
501 if (isBackward) {
502 prevOp.EmitOpCode(&buffer_, 0);
503 }
504 }
505 Advance();
506 break;
507 }
508 }
509 if (isAtom && !isError_) {
510 ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
511 }
512 if (isBackward) {
513 size_t end = buffer_.GetSize();
514 size_t termSize = end - atomBcStart;
515 size_t moveSize = end - start;
516 buffer_.Expand(end + termSize);
517 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
518 if (memmove_s(buffer_.buf_ + start + termSize, moveSize, buffer_.buf_ + start, moveSize) != EOK) {
519 LOG(FATAL, COMMON) << "memmove_s failed";
520 UNREACHABLE();
521 }
522 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
523 if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
524 LOG(FATAL, COMMON) << "memcpy_s failed";
525 UNREACHABLE();
526 }
527 }
528 }
529 }
530
FindGroupName(const PandaString & name)531 int RegExpParser::FindGroupName(const PandaString &name)
532 {
533 size_t len;
534 size_t nameLen = name.size();
535 const char *p = reinterpret_cast<char *>(groupNames_.buf_);
536 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
537 const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
538 int captureIndex = 1;
539 while (p < bufEnd) {
540 len = strlen(p);
541 if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
542 return captureIndex;
543 }
544 p += len + 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
545 captureIndex++;
546 }
547 return -1;
548 }
549
/**
 * Parses what follows an opening '(' (already consumed by the caller): a lookahead or
 * lookbehind assertion, a named or unnamed capture group, or a non-capturing group.
 *
 * On success c0_ is left on the closing ')'; the function does not consume it.
 *
 * @param captureIndex receives the index allocated for a capture group; not written for
 *                     assertions and non-capturing groups
 * @param isBackward   true while compiling the body of a lookbehind assertion
 * @return true when the parsed construct is an atom (capture or non-capturing group);
 *         false for assertions and after an error has been reported
 */
bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
{
    bool isAtom = false;
    do {
        if (c0_ == '?') {
            Advance();
            switch (c0_) {
                // (?=Disjunction[?U, ?N])
                case '=': {
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                    PrintF("Assertion(?= Disjunction)\n");
                    Advance();
                    uint32_t start = buffer_.size_;
                    ParseDisjunction(isBackward);
                    MatchOpCode matchOp;
                    matchOp.EmitOpCode(&buffer_, 0);
                    // The assertion opcode is inserted in front of the body and carries its length.
                    MatchAheadOpCode matchAheadOp;
                    uint32_t len = buffer_.size_ - start;
                    matchAheadOp.InsertOpCode(&buffer_, start, len);
                    break;
                }
                // (?!Disjunction[?U, ?N])
                case '!': {
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                    PrintF("Assertion(?! Disjunction)\n");
                    uint32_t start = buffer_.size_;
                    Advance();
                    ParseDisjunction(isBackward);
                    MatchOpCode matchOp;
                    matchOp.EmitOpCode(&buffer_, 0);
                    NegativeMatchAheadOpCode matchAheadOp;
                    uint32_t len = buffer_.size_ - start;
                    matchAheadOp.InsertOpCode(&buffer_, start, len);
                    break;
                }
                case '<': {
                    Advance();
                    // (?<=Disjunction[?U, ?N])
                    if (c0_ == '=') {
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("Assertion(?<= Disjunction)\n");
                        Advance();
                        uint32_t start = buffer_.size_;
                        // The body of a lookbehind is always compiled in backward mode.
                        ParseDisjunction(true);
                        MatchOpCode matchOp;
                        matchOp.EmitOpCode(&buffer_, 0);
                        MatchAheadOpCode matchAheadOp;
                        uint32_t len = buffer_.size_ - start;
                        matchAheadOp.InsertOpCode(&buffer_, start, len);
                        // (?<!Disjunction[?U, ?N])
                    } else if (c0_ == '!') {
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("Assertion(?<! Disjunction)\n");
                        Advance();
                        uint32_t start = buffer_.size_;
                        ParseDisjunction(true);
                        MatchOpCode matchOp;
                        matchOp.EmitOpCode(&buffer_, 0);
                        NegativeMatchAheadOpCode matchAheadOp;
                        uint32_t len = buffer_.size_ - start;
                        matchAheadOp.InsertOpCode(&buffer_, start, len);
                    } else {
                        // (?<GroupName>Disjunction): a named capture group.
                        Prev();
                        PandaString name;
                        // ParseGroupSpecifier() advances pc_ itself through this alias.
                        auto **pp = const_cast<const uint8_t **>(&pc_);
                        if (!ParseGroupSpecifier(pp, name)) {
                            ParseError("GroupName Syntax error.");
                            return false;
                        }
                        if (FindGroupName(name) > 0) {
                            ParseError("Duplicate GroupName error.");
                            return false;
                        }
                        groupNames_.EmitStr(name.c_str());
                        newGroupNames_.push_back(name);
                        // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                        PrintF("group name %s", name.c_str());
                        Advance();
                        // Continue with the shared capture-group code; the name has been recorded
                        // above, so the placeholder emitted for unnamed groups is skipped.
                        goto parseCapture;  // NOLINT(cppcoreguidelines-avoid-goto)
                    }
                    break;
                }
                // (?:Disjunction[?U, ?N])
                case ':':
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
                    PrintF("Atom(?<: Disjunction)\n");
                    isAtom = true;
                    Advance();
                    ParseDisjunction(isBackward);
                    break;
                default:
                    Advance();
                    ParseError("? Syntax error.");
                    return false;
            }
        } else {
            // Unnamed capture group: record an empty entry so that positions in groupNames_
            // stay aligned with capture indices.
            groupNames_.EmitChar(0);
        parseCapture:
            isAtom = true;
            *captureIndex = captureCount_++;
            SaveEndOpCode saveEndOp;
            SaveStartOpCode saveStartOp;
            // In backward mode the save-end / save-start opcodes are emitted in swapped order.
            if (isBackward) {
                saveEndOp.EmitOpCode(&buffer_, *captureIndex);
            } else {
                saveStartOp.EmitOpCode(&buffer_, *captureIndex);
            }
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("capture start %d \n", *captureIndex);
            ParseDisjunction(isBackward);
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("capture end %d \n", *captureIndex);
            if (isBackward) {
                saveStartOp.EmitOpCode(&buffer_, *captureIndex);
            } else {
                saveEndOp.EmitOpCode(&buffer_, *captureIndex);
            }
        }
    } while (c0_ != ')' && c0_ != KEY_EOF);
    // The group must be closed explicitly.
    if (c0_ != ')') {
        ParseError("capture syntax error");
        return false;
    }
    return isAtom;
}
675
ParseDecimalDigits()676 int RegExpParser::ParseDecimalDigits()
677 {
678 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
679 PrintF("Parse DecimalDigits------\n");
680 uint32_t result = 0;
681 bool overflow = false;
682 while (true) {
683 if (c0_ < '0' || c0_ > '9') {
684 break;
685 }
686 if (!overflow) {
687 if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
688 overflow = true;
689 } else {
690 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
691 }
692 }
693 Advance();
694 }
695 if (overflow) {
696 return INT32_MAX;
697 }
698 return result;
699 }
700
ParserIntervalQuantifier(int * pmin,int * pmax)701 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
702 {
703 // Quantifier::
704 // QuantifierPrefix
705 // QuantifierPrefix?
706 // QuantifierPrefix::
707 // *
708 // +
709 // ?
710 // {DecimalDigits}
711 // {DecimalDigits,}
712 // {DecimalDigits,DecimalDigits}
713 Advance();
714 *pmin = ParseDecimalDigits();
715 *pmax = *pmin;
716 switch (c0_) {
717 case ',': {
718 Advance();
719 if (c0_ == '}') {
720 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
721 PrintF("QuantifierPrefix{DecimalDigits,}\n");
722 *pmax = INT32_MAX;
723 Advance();
724 } else {
725 *pmax = ParseDecimalDigits();
726 if (c0_ == '}') {
727 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
728 PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
729 Advance();
730 } else {
731 return false;
732 }
733 }
734 break;
735 }
736 case '}':
737 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
738 PrintF("QuantifierPrefix{DecimalDigits}\n");
739 Advance();
740 break;
741 default:
742 Advance();
743 return false;
744 }
745 return true;
746 }
747
ParseQuantifierPrefix(int & min,int & max,bool & isGreedy)748 bool RegExpParser::ParseQuantifierPrefix(int &min, int &max, bool &isGreedy)
749 {
750 switch (c0_) {
751 case '*':
752 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
753 PrintF("QuantifierPrefix %c\n", c0_);
754 min = 0;
755 max = INT32_MAX;
756 Advance();
757 break;
758 case '+':
759 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
760 PrintF("QuantifierPrefix %c\n", c0_);
761 min = 1;
762 max = INT32_MAX;
763 Advance();
764 break;
765 case '?':
766 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
767 PrintF("QuantifierPrefix %c\n", c0_);
768 Advance();
769 min = 0;
770 max = 1;
771 break;
772 case '{': {
773 uint8_t *start = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
774 if (!ParserIntervalQuantifier(&min, &max)) {
775 pc_ = start;
776 Advance(); // back to '{'
777 return false;
778 }
779 if (min > max) {
780 ParseError("Invalid repetition count");
781 return false;
782 }
783 break;
784 }
785 default:
786 break;
787 }
788 if (c0_ == '?') {
789 isGreedy = false;
790 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
791 PrintF("Quantifier::QuantifierPrefix?\n");
792 Advance();
793 } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
794 ParseError("nothing to repeat");
795 return false;
796 }
797 return true;
798 }
799
ParseQuantifier(size_t atomBcStart,int captureStart,int captureEnd)800 void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
801 {
802 int min = -1;
803 int max = -1;
804 bool isGreedy = true;
805 if (!ParseQuantifierPrefix(min, max, isGreedy)) {
806 return;
807 }
808 if (min != -1 && max != -1) {
809 stackCount_++;
810 PushOpCode pushOp;
811 pushOp.InsertOpCode(&buffer_, atomBcStart);
812 atomBcStart += pushOp.GetSize();
813
814 if (captureStart != 0) {
815 SaveResetOpCode saveResetOp;
816 saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
817 }
818
819 // zero advance check
820 if (max == INT32_MAX) {
821 stackCount_++;
822 PushCharOpCode pushCharOp;
823 pushCharOp.InsertOpCode(&buffer_, atomBcStart);
824 CheckCharOpCode checkCharOp;
825 // NOLINTNEXTLINE(readability-magic-numbers)
826 checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
827 }
828
829 if (isGreedy) {
830 LoopGreedyOpCode loopOp;
831 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
832 } else {
833 LoopOpCode loopOp;
834 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
835 }
836
837 if (min == 0) {
838 if (isGreedy) {
839 SplitNextOpCode splitNextOp;
840 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
841 } else {
842 SplitFirstOpCode splitFirstOp;
843 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
844 }
845 }
846
847 PopOpCode popOp;
848 popOp.EmitOpCode(&buffer_);
849 }
850 }
851
ParseGroupSpecifier(const uint8_t ** pp,PandaString & name)852 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, PandaString &name)
853 {
854 const uint8_t *p = *pp;
855 uint32_t c;
856 std::array<char, CACHE_SIZE> buffer {};
857 char *q = buffer.data();
858 while (true) {
859 if (p <= end_) {
860 c = *p;
861 } else {
862 c = KEY_EOF;
863 }
864 if (c == '\\') {
865 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
866 p++;
867 if (*p != 'u') {
868 return false;
869 }
870 if (!ParseUnicodeEscape(&c)) {
871 return false;
872 }
873 } else if (c == '>') {
874 break;
875 } else if (c > CACHE_SIZE && c != KEY_EOF) {
876 c = static_cast<uint32_t>(UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
877 } else if (c != KEY_EOF) {
878 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
879 p++;
880 } else {
881 return false;
882 }
883 if (q == buffer.data()) {
884 if (IsIdentFirst(c) != 0) {
885 return false;
886 }
887 } else {
888 if (!u_isIDPart(c)) {
889 return false;
890 }
891 }
892 if (q != nullptr) {
893 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
894 *q++ = c;
895 }
896 }
897 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
898 p++;
899 *pp = p;
900 name = buffer.data();
901 return true;
902 }
903
ParseCaptureCount(const char * groupName)904 int RegExpParser::ParseCaptureCount(const char *groupName)
905 {
906 const uint8_t *p = nullptr;
907 int captureIndex = 1;
908 PandaString name;
909 hasNamedCaptures_ = 0;
910 for (p = base_; p < end_; p++) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
911 switch (*p) {
912 case '(': {
913 if (p[1] == '?') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
914 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
915 if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
916 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
917 p[CAPTURE_CONUT_ADVANCE] != '=') {
918 hasNamedCaptures_ = 1;
919 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
920 p += CAPTURE_CONUT_ADVANCE;
921 if (groupName != nullptr) {
922 if (ParseGroupSpecifier(&p, name)) {
923 if (strcmp(name.c_str(), groupName) == 0) {
924 return captureIndex;
925 }
926 }
927 }
928 captureIndex++;
929 }
930 } else {
931 captureIndex++;
932 }
933 break;
934 }
935 case '\\':
936 p++; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
937 break;
938 case '[': {
939 while (p < end_ && *p != ']') {
940 if (*p == '\\') {
941 p++; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
942 }
943 p++; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
944 }
945 break;
946 }
947 default:
948 break;
949 }
950 }
951 return captureIndex;
952 }
953
/**
 * Parses the part of an escape that follows the backslash (c0_ is the character after '\').
 *
 * Back references (\1..\9..., \k<name>) and character class escapes (\d \D \s \S \w \W) are
 * emitted directly into buffer_ and the function returns -1. Every other escape is delegated
 * to ParseCharacterEscape() and its result is returned for the caller to emit.
 *
 * @param isBackward true while compiling the body of a lookbehind assertion
 * @return the value returned by ParseCharacterEscape(), or -1 when this function already
 *         emitted the bytecode or reported an error
 */
// NOLINTNEXTLINE(readability-function-size)
int RegExpParser::ParseAtomEscape(bool isBackward)
{
    // AtomEscape[U, N]::
    //     DecimalEscape
    //     CharacterClassEscape[?U]
    //     CharacterEscape[?U]
    //     [+N]kGroupName[?U]
    int result = -1;
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
    PrintF("Parse AtomEscape------\n");
    // Shared by all class escapes: emitted before the range here and once more at parseLookBehind.
    PrevOpCode prevOp;
    switch (c0_) {
        case KEY_EOF:
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            ParseError("unexpected end");
            break;
        // DecimalEscape
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9': {
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            PrintF("NonZeroDigit %c\n", c0_);
            int capture = ParseDecimalDigits();
            // Accept references to groups seen so far; otherwise rescan the whole pattern so
            // that references to groups defined later are accepted as well.
            if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
                ParseError("invalid backreference count");
                break;
            }
            if (isBackward) {
                BackwardBackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, capture);
            } else {
                BackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, capture);
            }
            break;
        }
        // CharacterClassEscape
        case 'd': {
            // [0-9]
            RangeOpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, g_gRangeD);
            goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
        }
        case 'D': {
            // [^0-9]
            RangeSet atomRange(g_gRangeD);
            atomRange.Invert(IsUtf16());
            Range32OpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, atomRange);
            goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
        }
        case 's': {
            // The white space and line terminator code points listed in g_gRangeS
            RangeOpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, g_gRangeS);
            goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
        }
        case 'S': {
            // Everything not in g_gRangeS
            RangeSet atomRange(g_gRangeS);
            Range32OpCode rangeOp;
            atomRange.Invert(IsUtf16());
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, atomRange);
            goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
        }
        case 'w': {
            // [A-Za-z0-9_]
            RangeOpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, g_gRangeW);
            goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
        }
        case 'W': {
            // [^A-Za-z0-9_]
            RangeSet atomRange(g_gRangeW);
            atomRange.Invert(IsUtf16());
            Range32OpCode rangeOp;
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            rangeOp.InsertOpCode(&buffer_, atomRange);
            goto parseLookBehind;  // NOLINT(cppcoreguidelines-avoid-goto)
        }
        // P{UnicodePropertyValueExpression}
        // p{UnicodePropertyValueExpression}
        // NOTE(review): \P and \p share the \k code below; no Unicode property expression is
        // parsed in this function - confirm this is intended.
        case 'P':
        case 'p':
        // [+N]kGroupName[?U]
        case 'k': {
            Advance();
            if (c0_ != '<') {
                if (!IsUtf16() || HasNamedCaptures()) {
                    ParseError("expecting group name.");
                    break;
                }
            }
            Advance();
            Prev();
            PandaString name;
            // ParseGroupSpecifier() advances pc_ itself through this alias.
            auto **pp = const_cast<const uint8_t **>(&pc_);
            if (!ParseGroupSpecifier(pp, name)) {
                ParseError("GroupName Syntax error.");
                break;
            }
            // Look among the groups parsed so far first, then rescan the whole pattern for a
            // group that is defined later.
            int postion = FindGroupName(name);
            if (postion < 0) {
                postion = ParseCaptureCount(name.c_str());
                // NOTE(review): ParseCaptureCount() as written in this file starts counting at 1
                // and only increments, so this check looks unreachable - verify how an
                // undefined group name is meant to be reported.
                if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
                    ParseError("group name not defined");
                    break;
                }
            }
            if (isBackward) {
                BackwardBackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, postion);
            } else {
                BackReferenceOpCode backReferenceOp;
                backReferenceOp.EmitOpCode(&buffer_, postion);
            }
            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
            Advance();
            break;
        }
        // Common tail of the class escapes: second Prev in backward mode, then consume the
        // escape letter.
        parseLookBehind : {
            if (isBackward) {
                prevOp.EmitOpCode(&buffer_, 0);
            }
            Advance();
            break;
        }
        default:
            result = ParseCharacterEscape();
            break;
    }
    return result;
}
1110
RecountCaptures()1111 int RegExpParser::RecountCaptures()
1112 {
1113 if (totalCaptureCount_ < 0) {
1114 const char *name = reinterpret_cast<const char *>(groupNames_.buf_);
1115 totalCaptureCount_ = ParseCaptureCount(name);
1116 }
1117 return totalCaptureCount_;
1118 }
HasNamedCaptures()1119 bool RegExpParser::HasNamedCaptures()
1120 {
1121 if (hasNamedCaptures_ < 0) {
1122 RecountCaptures();
1123 }
1124 return false;
1125 }
1126
ParseCharacterEscape()1127 int RegExpParser::ParseCharacterEscape()
1128 {
1129 // CharacterEscape[U]::
1130 // ControlEscape
1131 // c ControlLetter
1132 // 0 [lookahead ∉ DecimalDigit]
1133 // HexEscapeSequence
1134 // RegExpUnicodeEscapeSequence[?U]
1135 // IdentityEscape[?U]
1136 uint32_t result = 0;
1137 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1138 switch (c0_) {
1139 // ControlEscape
1140 case 'f':
1141 result = '\f';
1142 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1143 PrintF("ControlEscape %c\n", c0_);
1144 Advance();
1145 break;
1146 case 'n':
1147 result = '\n';
1148 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1149 PrintF("ControlEscape %c\n", c0_);
1150 Advance();
1151 break;
1152 case 'r':
1153 result = '\r';
1154 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1155 PrintF("ControlEscape %c\n", c0_);
1156 Advance();
1157 break;
1158 case 't':
1159 result = '\t';
1160 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1161 PrintF("ControlEscape %c\n", c0_);
1162 Advance();
1163 break;
1164 case 'v':
1165 result = '\v';
1166 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1167 PrintF("ControlEscape %c\n", c0_);
1168 Advance();
1169 break;
1170 // c ControlLetter
1171 case 'c': {
1172 Advance();
1173 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1174 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1175 PrintF("ControlLetter %c\n", c0_);
1176 result = static_cast<uint32_t>(c0_) & 0x1f; // NOLINT(readability-magic-numbers, hicpp-signed-bitwise)
1177 Advance();
1178 } else {
1179 if (!IsUtf16()) {
1180 pc_--; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1181 result = '\\';
1182 } else {
1183 ParseError("Invalid control letter");
1184 return -1;
1185 }
1186 }
1187 break;
1188 }
1189 case '0': {
1190 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1191 PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
1192 if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) { // NOLINT(readability-magic-numbers)
1193 Advance();
1194 result = 0;
1195 break;
1196 }
1197 [[fallthrough]];
1198 }
1199 case '1':
1200 case '2':
1201 case '3':
1202 case '4':
1203 case '5':
1204 case '6':
1205 case '7': {
1206 if (IsUtf16()) {
1207 // With /u, decimal escape is not interpreted as octal character code.
1208 ParseError("Invalid class escape");
1209 return 0;
1210 }
1211 result = ParseOctalLiteral();
1212 break;
1213 }
1214 // ParseHexEscapeSequence
1215 // ParseRegExpUnicodeEscapeSequence
1216 case 'x': {
1217 Advance();
1218 if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
1219 return result;
1220 }
1221 if (IsUtf16()) {
1222 ParseError("Invalid class escape");
1223 return -1;
1224 }
1225 result = 'x';
1226 break;
1227 }
1228 case 'u': {
1229 Advance();
1230 if (ParseUnicodeEscape(&result)) {
1231 return result;
1232 }
1233 if (IsUtf16()) {
1234 // With /u, invalid escapes are not treated as identity escapes.
1235 ParseError("Invalid unicode escape");
1236 return 0;
1237 }
1238 // If \u is not followed by a two-digit hexadecimal, treat it
1239 // as an identity escape.
1240 result = 'u';
1241 break;
1242 }
1243 // IdentityEscape[?U]
1244 case '$':
1245 case '(':
1246 case ')':
1247 case '*':
1248 case '+':
1249 case '.':
1250 case '/':
1251 case '?':
1252 case '[':
1253 case '\\':
1254 case ']':
1255 case '^':
1256 case '{':
1257 case '|':
1258 case '}':
1259 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1260 PrintF("IdentityEscape %c\n", c0_);
1261 result = c0_;
1262 Advance();
1263 break;
1264 default: {
1265 if (IsUtf16()) {
1266 ParseError("Invalid unicode escape");
1267 return 0;
1268 }
1269 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1270 PrintF("SourceCharacter %c\n", c0_);
1271 result = c0_;
1272 if (result < CHAR_MAXS) {
1273 Advance();
1274 }
1275 break;
1276 }
1277 }
1278 return result;
1279 }
1280
ParseClassRanges(RangeSet * result)1281 bool RegExpParser::ParseClassRanges(RangeSet *result)
1282 {
1283 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1284 PrintF("Parse ClassRanges------\n");
1285 while (c0_ != ']') {
1286 RangeSet s1;
1287 uint32_t c1 = ParseClassAtom(&s1);
1288 if (c1 == UINT32_MAX) {
1289 ParseError("invalid class range");
1290 return false;
1291 }
1292
1293 int nextC0 = *pc_;
1294 if (c0_ == '-' && nextC0 != ']') {
1295 if (c1 == CLASS_RANGE_BASE) {
1296 if (IsUtf16()) {
1297 ParseError("invalid class range");
1298 return false;
1299 }
1300 result->Insert(s1);
1301 continue;
1302 }
1303 Advance();
1304 RangeSet s2;
1305 uint32_t c2 = ParseClassAtom(&s2);
1306 if (c2 == UINT32_MAX) {
1307 ParseError("invalid class range");
1308 return false;
1309 }
1310 if (c2 == CLASS_RANGE_BASE) {
1311 if (IsUtf16()) {
1312 ParseError("invalid class range");
1313 return false;
1314 }
1315 result->Insert(s2);
1316 continue;
1317 }
1318 if (c1 < INT8_MAX) {
1319 if (c1 > c2) {
1320 ParseError("invalid class range");
1321 return false;
1322 }
1323 }
1324 if (IsIgnoreCase()) {
1325 c1 = static_cast<uint32_t>(Canonicalize(c1, IsUtf16()));
1326 c2 = static_cast<uint32_t>(Canonicalize(c2, IsUtf16()));
1327 }
1328
1329 result->Insert(c1, c2);
1330 } else {
1331 result->Insert(s1);
1332 }
1333 }
1334 Advance();
1335 return true;
1336 }
1337
ParseClassAtom(RangeSet * atom)1338 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1339 {
1340 uint32_t ret = UINT32_MAX;
1341 switch (c0_) {
1342 case '\\': {
1343 Advance();
1344 ret = static_cast<uint32_t>(ParseClassEscape(atom));
1345 break;
1346 }
1347 case KEY_EOF:
1348 break;
1349 case 0: {
1350 if (pc_ >= end_) {
1351 return UINT32_MAX;
1352 }
1353 [[fallthrough]];
1354 }
1355 default: {
1356 uint32_t value = c0_;
1357 size_t u16Size;
1358 if (c0_ > INT8_MAX) {
1359 pc_ -= 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1360 auto u16Result = utf::ConvertUtf8ToUtf16Pair(pc_, true);
1361 value = u16Result.first;
1362 u16Size = u16Result.second;
1363 Advance(u16Size + 1);
1364 } else {
1365 Advance();
1366 }
1367 if (IsIgnoreCase()) {
1368 value = static_cast<uint32_t>(Canonicalize(value, IsUtf16()));
1369 }
1370 atom->Insert(RangeSet(value));
1371 ret = value;
1372 break;
1373 }
1374 }
1375 return ret;
1376 }
1377
ParseClassEscape(RangeSet * atom)1378 int RegExpParser::ParseClassEscape(RangeSet *atom)
1379 {
1380 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1381 PrintF("Parse ClassEscape------\n");
1382 int result = -1;
1383 switch (c0_) {
1384 case 'b':
1385 Advance();
1386 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1387 PrintF("ClassEscape %c", 'b');
1388 result = '\b';
1389 atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1390 break;
1391 case '-':
1392 Advance();
1393 result = '-';
1394 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1395 PrintF("ClassEscape %c", '-');
1396 atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1397 break;
1398 // CharacterClassEscape
1399 case 'd':
1400 case 'D':
1401 result = CLASS_RANGE_BASE;
1402 atom->Insert(g_gRangeD);
1403 if (c0_ == 'D') {
1404 atom->Invert(IsUtf16());
1405 }
1406 Advance();
1407 break;
1408 case 's':
1409 case 'S':
1410 result = CLASS_RANGE_BASE;
1411 atom->Insert(g_gRangeS);
1412 if (c0_ == 'S') {
1413 atom->Invert(IsUtf16());
1414 }
1415 Advance();
1416 break;
1417 case 'w':
1418 case 'W':
1419 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1420 PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1421 result = CLASS_RANGE_BASE;
1422 atom->Insert(g_gRangeW);
1423 if (c0_ == 'W') {
1424 atom->Invert(IsUtf16());
1425 }
1426 Advance();
1427 break;
1428 // P{UnicodePropertyValueExpression}
1429 // p{UnicodePropertyValueExpression}
1430 case 'P':
1431 case 'p':
1432 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1433 PrintF("Warning: \\p is not supported in ECMA 2015!");
1434 Advance();
1435 if (c0_ == '{') {
1436 Advance();
1437 if (c0_ == '}') {
1438 break; // p{}, invalid
1439 }
1440 bool isValue = false;
1441 ParseUnicodePropertyValueCharacters(&isValue);
1442 if (!isValue && c0_ == '=') {
1443 // UnicodePropertyName = UnicodePropertyValue
1444 Advance();
1445 if (c0_ == '}') {
1446 break; // p{xxx=}, invalid
1447 }
1448 ParseUnicodePropertyValueCharacters(&isValue);
1449 }
1450 if (c0_ != '}') {
1451 break; // p{xxx, invalid
1452 }
1453 // should do atom->Invert() here after ECMA 9.0
1454 Advance();
1455 result = CLASS_RANGE_BASE;
1456 }
1457 break;
1458 default:
1459 result = ParseCharacterEscape();
1460 int value = result;
1461 if (IsIgnoreCase()) {
1462 value = Canonicalize(value, IsUtf16());
1463 }
1464 atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1465 break;
1466 }
1467 return result;
1468 }
1469
ParseUnicodePropertyValueCharacters(bool * isValue)1470 void RegExpParser::ParseUnicodePropertyValueCharacters(bool *isValue)
1471 {
1472 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1473 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1474 PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1475 } else if (c0_ == '_') {
1476 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1477 PrintF("UnicodePropertyCharacter:: _ \n");
1478 } else if (c0_ >= '0' && c0_ <= '9') {
1479 *isValue = true;
1480 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1481 PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1482 } else {
1483 return;
1484 }
1485 Advance();
1486 ParseUnicodePropertyValueCharacters(isValue);
1487 }
1488
1489 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1490 void RegExpParser::PrintF(const char *fmt, ...)
1491 {
1492 (void)fmt;
1493 }
1494
ParseError(const char * errorMessage)1495 void RegExpParser::ParseError(const char *errorMessage)
1496 {
1497 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1498 PrintF("error: ");
1499 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1500 PrintF(errorMessage);
1501 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1502 PrintF("\n");
1503 SetIsError();
1504 size_t length = strlen(errorMessage) + 1;
1505 if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1506 LOG(FATAL, COMMON) << "memcpy_s failed";
1507 UNREACHABLE();
1508 }
1509 }
1510
IsIdentFirst(uint32_t c)1511 int RegExpParser::IsIdentFirst(uint32_t c)
1512 {
1513 if (c < CACHE_SIZE) {
1514 // NOLINTNEXTLINE(hicpp-signed-bitwise
1515 return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1; // 5: Shift five bits 31: and operation binary of 31
1516 }
1517 return static_cast<int>(u_isIDStart(c));
1518 }
1519 } // namespace ark