1 /**
2 * Copyright (c) 2021-2025 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "runtime/regexp/ecmascript/regexp_parser.h"
17 #include "runtime/regexp/ecmascript/regexp_opcode.h"
18 #include "runtime/include/coretypes/string-inl.h"
19
20 #include "libpandabase/utils/utils.h"
21 #include "libpandabase/utils/utf.h"
22
23 #include "securec.h"
24 #include "unicode/uchar.h"
25 #include "unicode/uniset.h"
26
27 namespace {
// Sentinel returned by FromUtf8() / UnicodeFromUtf8() when a byte sequence cannot be decoded.
constexpr int INVALID_UNICODE_FROM_UTF8 = -1;

// Lead-byte thresholds consulted by UnicodeFromUtf8():
//   [0]        lowest non-ASCII byte; any smaller byte is returned unchanged
//   [1]..[2]   lead bytes that announce 1 continuation byte
//   [3]..[4]   lead bytes that announce 2 continuation bytes
//   [5]..[6]   lead bytes that announce 3 continuation bytes
//   [7]..[8]   lead bytes that announce 4 continuation bytes
//   [9], [10]  the two lead bytes that announce 5 continuation bytes
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
constexpr int UICODE_FROM_UTF8[] = {
    0x80,        // [0]
    0xc0, 0xdf,  // [1], [2]
    0xe0, 0xef,  // [3], [4]
    0xf0, 0xf7,  // [5], [6]
    0xf8, 0xfb,  // [7], [8]
    0xfc, 0xfd,  // [9], [10]
};

// Indexed by (continuation byte count - 1): the smallest decoded value FromUtf8() accepts
// for a sequence of that length; anything smaller is rejected.
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
constexpr int UTF8_MIN_CODE[] = {
    0x80,        // 1 continuation byte
    0x800,       // 2 continuation bytes
    0x10000,     // 3 continuation bytes
    0x00200000,  // 4 continuation bytes
    0x04000000,  // 5 continuation bytes
};

// Indexed by (continuation byte count - 1): mask FromUtf8() applies to the lead byte
// to keep only its payload bits.
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
constexpr char UTF8_FIRST_CODE[] = {
    0x1f,  // 1 continuation byte
    0xf,   // 2 continuation bytes
    0x7,   // 3 continuation bytes
    0x3,   // 4 continuation bytes
    0x1,   // 5 continuation bytes
};
44
FromUtf8(int c,int l,const uint8_t * p,const uint8_t ** pp)45 int FromUtf8(int c, int l, const uint8_t *p, const uint8_t **pp)
46 {
47 uint32_t b;
48 uint32_t cc = c;
49 // NOLINTNEXTLINE(hicpp-signed-bitwise)
50 cc &= UTF8_FIRST_CODE[l - 1];
51 for (int i = 0; i < l; i++) {
52 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
53 b = *p++;
54 if (b < ark::utf::UTF8_2B_SECOND || b >= ark::utf::UTF8_2B_FIRST) {
55 return INVALID_UNICODE_FROM_UTF8;
56 }
57 // NOLINTNEXTLINE(hicpp-signed-bitwise)
58 cc = (cc << 6) | (b & ark::utf::UTF8_2B_THIRD); // 6: Maximum Unicode range
59 }
60 if (cc < static_cast<uint32_t>(UTF8_MIN_CODE[l - 1])) {
61 return INVALID_UNICODE_FROM_UTF8;
62 }
63 *pp = p;
64 return static_cast<int>(cc);
65 }
66
UnicodeFromUtf8(const uint8_t * p,int maxLen,const uint8_t ** pp)67 int UnicodeFromUtf8(const uint8_t *p, int maxLen, const uint8_t **pp)
68 {
69 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
70 int c = *p++;
71 if (c < UICODE_FROM_UTF8[0]) {
72 *pp = p;
73 return c;
74 }
75 int l = 0;
76 if (c >= UICODE_FROM_UTF8[1U] && c <= UICODE_FROM_UTF8[2U]) { // 1 - 2: 0000 0080 - 0000 07FF
77 l = 1; // 1: 0000 0080 - 0000 07FF Unicode
78 } else if (c >= UICODE_FROM_UTF8[3U] && c <= UICODE_FROM_UTF8[4U]) { // 3 - 4: 0000 0800 - 0000 FFFF
79 l = 2; // 2: 0000 0800 - 0000 FFFF Unicode
80 } else if (c >= UICODE_FROM_UTF8[5U] && c <= UICODE_FROM_UTF8[6U]) { // 5 - 6: 0001 0000 - 0010 FFFF
81 l = 3; // 3: 0001 0000 - 0010 FFFF Unicode
82 } else if (c >= UICODE_FROM_UTF8[7U] && c <= UICODE_FROM_UTF8[8U]) { // 7 - 8: 0020 0000 - 03FF FFFF
83 l = 4; // 4: 0020 0000 - 03FF FFFF Unicode
84 // NOLINTNEXTLINE(readability-magic-numbers)
85 } else if (c == UICODE_FROM_UTF8[9U] || c == UICODE_FROM_UTF8[10U]) { // 9 - 10: 0400 0000 - 7FFF FFFF
86 l = 5; // 5: 0400 0000 - 7FFF FFFF Unicode
87 } else {
88 return INVALID_UNICODE_FROM_UTF8;
89 }
90 /* check that we have enough characters */
91 if (l > (maxLen - 1)) {
92 return INVALID_UNICODE_FROM_UTF8;
93 }
94 return FromUtf8(c, l, p, pp);
95 }
96 } // namespace
97
98 namespace ark {
99 static constexpr uint32_t CACHE_SIZE = 128;
100 static constexpr uint32_t CHAR_MAXS = 128;
101 // NOLINTNEXTLINE(modernize-avoid-c-arrays)
102 static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
103 /* $ A-Z _ a-z */
104 0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE};
105 static RangeSet g_gRangeD(0x30, 0x39); // NOLINT(fuchsia-statically-constructed-objects, readability-magic-numbers)
106 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
107 static RangeSet g_gRangeS({
108 std::pair<uint32_t, uint32_t>(0x0009, 0x000D), // NOLINT(readability-magic-numbers)
109 std::pair<uint32_t, uint32_t>(0x0020, 0x0020), // NOLINT(readability-magic-numbers)
110 std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0), // NOLINT(readability-magic-numbers)
111 std::pair<uint32_t, uint32_t>(0x1680, 0x1680), // NOLINT(readability-magic-numbers)
112 std::pair<uint32_t, uint32_t>(0x2000, 0x200A), // NOLINT(readability-magic-numbers)
113 /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
114 /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
115 std::pair<uint32_t, uint32_t>(0x2028, 0x2029), // NOLINT(readability-magic-numbers)
116 std::pair<uint32_t, uint32_t>(0x202F, 0x202F), // NOLINT(readability-magic-numbers)
117 std::pair<uint32_t, uint32_t>(0x205F, 0x205F), // NOLINT(readability-magic-numbers)
118 std::pair<uint32_t, uint32_t>(0x3000, 0x3000), // NOLINT(readability-magic-numbers)
119 /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
120 std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF), // NOLINT(readability-magic-numbers)
121 });
122
123 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
124 static RangeSet g_gRangeW({
125 std::pair<uint32_t, uint32_t>(0x0030, 0x0039), // NOLINT(readability-magic-numbers)
126 std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINT(readability-magic-numbers)
127 std::pair<uint32_t, uint32_t>(0x005F, 0x005F), // NOLINT(readability-magic-numbers)
128 std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINT(readability-magic-numbers)
129 });
130
131 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
132 static RangeSet g_gRegexpIdentifyStart({
133 std::pair<uint32_t, uint32_t>(0x0024, 0x0024), // NOLINT(readability-magic-numbers)
134 std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINT(readability-magic-numbers)
135 std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINT(readability-magic-numbers)
136 });
137
138 // NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
139 static RangeSet g_gRegexpIdentifyContinue({
140 std::pair<uint32_t, uint32_t>(0x0024, 0x0024), // NOLINT(readability-magic-numbers)
141 std::pair<uint32_t, uint32_t>(0x0030, 0x0039), // NOLINT(readability-magic-numbers)
142 std::pair<uint32_t, uint32_t>(0x0041, 0x005A), // NOLINT(readability-magic-numbers)
143 std::pair<uint32_t, uint32_t>(0x0061, 0x007A), // NOLINT(readability-magic-numbers)
144 });
145
Parse()146 void RegExpParser::Parse()
147 {
148 // dynbuffer head init [size,capture_count,statck_count,flags]
149 buffer_.EmitU32(0);
150 buffer_.EmitU32(0);
151 buffer_.EmitU32(0);
152 buffer_.EmitU32(0);
153 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
154 PrintF("Parse Pattern------\n");
155 // Pattern[U, N]::
156 // Disjunction[?U, ?N]
157 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
158 Advance();
159 SaveStartOpCode saveStartOp;
160 int captureIndex = captureCount_++;
161 saveStartOp.EmitOpCode(&buffer_, captureIndex);
162 ParseDisjunction(false);
163 if (c0_ != KEY_EOF) {
164 ParseError("extraneous characters at the end");
165 return;
166 }
167 SaveEndOpCode saveEndOp;
168 saveEndOp.EmitOpCode(&buffer_, captureIndex);
169 MatchEndOpCode matchEndOp;
170 matchEndOp.EmitOpCode(&buffer_, 0);
171 // dynbuffer head assignments
172 buffer_.PutU32(0, buffer_.size_);
173 buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
174 buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
175 buffer_.PutU32(FLAGS_OFFSET, flags_);
176 }
177
ParseDisjunction(bool isBackward)178 void RegExpParser::ParseDisjunction(bool isBackward)
179 {
180 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
181 PrintF("Parse Disjunction------\n");
182 size_t start = buffer_.size_;
183 ParseAlternative(isBackward);
184 if (isError_) {
185 return;
186 }
187 do {
188 if (c0_ == '|') {
189 SplitNextOpCode splitOp;
190 uint32_t len = buffer_.size_ - start;
191 GotoOpCode gotoOp;
192 splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
193 uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
194 Advance();
195 ParseAlternative(isBackward);
196 gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
197 }
198 } while (c0_ != KEY_EOF && c0_ != ')');
199 }
200
ParseOctalLiteral()201 uint32_t RegExpParser::ParseOctalLiteral()
202 {
203 // For compatibility with some other browsers (not all), we parse
204 // up to three octal digits with a value below 256.
205 // ES#prod-annexB-LegacyOctalEscapeSequence
206 uint32_t value = c0_ - '0';
207 Advance();
208 if (c0_ >= '0' && c0_ <= '7') {
209 value = value * OCTAL_VALUE + c0_ - '0';
210 Advance();
211 if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
212 value = value * OCTAL_VALUE + c0_ - '0';
213 Advance();
214 }
215 }
216 return value;
217 }
218
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)219 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
220 {
221 uint32_t x = 0;
222 int d = static_cast<int>(HexValue(c0_));
223 if (d < 0) {
224 return false;
225 }
226 while (d >= 0) {
227 if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
228 LOG(FATAL, COMMON) << "value overflow";
229 return false;
230 }
231 x = x * HEX_VALUE + static_cast<uint32_t>(d);
232 if (x > maxValue) {
233 return false;
234 }
235 Advance();
236 d = static_cast<int>(HexValue(c0_));
237 }
238 *value = x;
239 return true;
240 }
241
242 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uint32_t * value)243 bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
244 {
245 // Accept both \uxxxx and \u{xxxxxx} (if allowed).
246 // In the latter case, the number of hex digits between { } is arbitrary.
247 // \ and u have already been read.
248 if (c0_ == '{' && IsUtf16()) {
249 uint8_t *start = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
250 Advance();
251 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { // NOLINT(readability-magic-numbers)
252 if (c0_ == '}') {
253 Advance();
254 return true;
255 }
256 }
257 pc_ = start;
258 Advance();
259 return false;
260 }
261 // \u but no {, or \u{...} escapes not allowed.
262 bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
263 if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
264 // Attempt to read trail surrogate.
265 uint8_t *start = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
266 if (*pc_ == 'u') {
267 Advance(UNICODE_HEX_ADVANCE);
268 uint32_t trail;
269 if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
270 *value = U16_GET_SUPPLEMENTARY((*value), (trail)); // NOLINT(hicpp-signed-bitwise)
271 return true;
272 }
273 }
274 pc_ = start;
275 Advance();
276 }
277 return result;
278 }
279
ParseHexEscape(int length,uint32_t * value)280 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
281 {
282 uint8_t *start = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
283 uint32_t val = 0;
284 for (int i = 0; i < length; ++i) {
285 uint32_t c = c0_;
286 int d = static_cast<int>(HexValue(c));
287 if (d < 0) {
288 pc_ = start;
289 Advance();
290 return false;
291 }
292 val = val * HEX_VALUE + static_cast<uint32_t>(d);
293 Advance();
294 }
295 *value = val;
296 return true;
297 }
298
ParseAlternativeEscape(bool isBackward,bool & isAtom)299 void RegExpParser::ParseAlternativeEscape(bool isBackward, bool &isAtom)
300 {
301 switch (c0_) {
302 case 'b': {
303 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
304 PrintF("Assertion %c \n", c0_);
305 WordBoundaryOpCode wordBoundaryOp;
306 wordBoundaryOp.EmitOpCode(&buffer_, 0);
307 Advance();
308 break;
309 }
310 case 'B': {
311 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
312 PrintF("Assertion %c \n", c0_);
313 NotWordBoundaryOpCode notWordBoundaryOp;
314 notWordBoundaryOp.EmitOpCode(&buffer_, 0);
315 Advance();
316 break;
317 }
318 default: {
319 isAtom = true;
320 int atomValue = ParseAtomEscape(isBackward);
321 if (atomValue != -1) {
322 ParseAlternativeEscapeDefault(atomValue);
323 }
324 break;
325 }
326 }
327 }
328
ParseAlternativeEscapeDefault(int atomValue)329 void RegExpParser::ParseAlternativeEscapeDefault(int atomValue)
330 {
331 if (IsIgnoreCase()) {
332 if (!IsUtf16()) {
333 atomValue = Canonicalize(atomValue, false);
334 } else {
335 icu::UnicodeSet set(atomValue, atomValue);
336 set.closeOver(USET_CASE_INSENSITIVE);
337 set.removeAllStrings();
338 int32_t size = set.size();
339 RangeOpCode rangeOp;
340 RangeSet rangeResult;
341 for (int32_t idx = 0; idx < size; idx++) {
342 int32_t uc = set.charAt(idx);
343 RangeSet curRange(uc);
344 rangeResult.Insert(curRange);
345 }
346 rangeOp.InsertOpCode(&buffer_, rangeResult);
347 return;
348 }
349 }
350 if (atomValue <= UINT16_MAX) {
351 CharOpCode charOp;
352 charOp.EmitOpCode(&buffer_, atomValue);
353 } else {
354 Char32OpCode charOp;
355 charOp.EmitOpCode(&buffer_, atomValue);
356 }
357 }
358
ParsePatternCharacter(bool isBackward)359 void RegExpParser::ParsePatternCharacter(bool isBackward)
360 {
361 PrevOpCode prevOp;
362 if (isBackward) {
363 prevOp.EmitOpCode(&buffer_, 0);
364 }
365 uint32_t matchedChar = c0_;
366 if (c0_ > (INT8_MAX + 1)) {
367 Prev();
368 int i = 0;
369 UChar32 c = 0;
370 int32_t length = end_ - pc_ + 1;
371 // NOLINTNEXTLINE(hicpp-signed-bitwise)
372 U8_NEXT(pc_, i, length, c); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
373 matchedChar = static_cast<uint32_t>(c);
374 pc_ += i; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
375 }
376 if (IsIgnoreCase()) {
377 matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
378 }
379 if (matchedChar > UINT16_MAX) {
380 Char32OpCode charOp;
381 charOp.EmitOpCode(&buffer_, matchedChar);
382 } else {
383 CharOpCode charOp;
384 charOp.EmitOpCode(&buffer_, matchedChar);
385 }
386 if (isBackward) {
387 prevOp.EmitOpCode(&buffer_, 0);
388 }
389 }
390
ParseAlternativeAny(bool isBackward)391 void RegExpParser::ParseAlternativeAny(bool isBackward)
392 {
393 PrevOpCode prevOp;
394 if (isBackward) {
395 prevOp.EmitOpCode(&buffer_, 0);
396 }
397 if (IsDotAll()) {
398 AllOpCode allOp;
399 allOp.EmitOpCode(&buffer_, 0);
400 } else {
401 DotsOpCode dotsOp;
402 dotsOp.EmitOpCode(&buffer_, 0);
403 }
404 if (isBackward) {
405 prevOp.EmitOpCode(&buffer_, 0);
406 }
407 }
408
ParseAlternativeRange(bool isBackward)409 void RegExpParser::ParseAlternativeRange(bool isBackward)
410 {
411 PrevOpCode prevOp;
412 Advance();
413 if (isBackward) {
414 prevOp.EmitOpCode(&buffer_, 0);
415 }
416 bool isInvert = false;
417 if (c0_ == '^') {
418 isInvert = true;
419 Advance();
420 }
421 RangeSet rangeResult;
422 if (!ParseClassRanges(&rangeResult)) {
423 return;
424 }
425 if (isInvert) {
426 rangeResult.Invert(IsUtf16());
427 }
428 uint32_t highValue = rangeResult.HighestValue();
429 if (highValue <= UINT16_MAX) {
430 RangeOpCode rangeOp;
431 rangeOp.InsertOpCode(&buffer_, rangeResult);
432 } else {
433 Range32OpCode rangeOp;
434 rangeOp.InsertOpCode(&buffer_, rangeResult);
435 }
436
437 if (isBackward) {
438 prevOp.EmitOpCode(&buffer_, 0);
439 }
440 }
441
442 // CC-OFFNXT(G.FUN.01, huge_method) solid logic
ParseAlternativeImpl(bool isBackward,bool & isAtom,int & captureIndex)443 void RegExpParser::ParseAlternativeImpl(bool isBackward, bool &isAtom, int &captureIndex)
444 {
445 switch (c0_) {
446 case '^': {
447 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
448 PrintF("Assertion %c line start \n", c0_);
449 LineStartOpCode lineStartOp;
450 lineStartOp.EmitOpCode(&buffer_, 0);
451 Advance();
452 break;
453 }
454 case '$': {
455 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
456 PrintF("Assertion %c line end \n", c0_);
457 LineEndOpCode lineEndOp;
458 lineEndOp.EmitOpCode(&buffer_, 0);
459 Advance();
460 break;
461 }
462 case '\\': {
463 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
464 PrintF("Escape %c \n", c0_);
465 Advance();
466 ParseAlternativeEscape(isBackward, isAtom);
467 break;
468 }
469 case '(': {
470 Advance();
471 isAtom = ParseAssertionCapture(&captureIndex, isBackward);
472 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
473 Advance();
474 break;
475 }
476 case '.': {
477 ParseAlternativeAny(isBackward);
478 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
479 PrintF("Atom %c match any \n", c0_);
480 isAtom = true;
481 Advance();
482 break;
483 }
484 case '[': {
485 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
486 PrintF("Atom %c match range \n", c0_);
487 isAtom = true;
488 ParseAlternativeRange(isBackward);
489 break;
490 }
491 case '*':
492 case '+':
493 case '?':
494 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
495 ParseError("nothing to repeat");
496 return;
497 case '{': {
498 uint8_t *begin = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
499 int dummy;
500 if (ParserIntervalQuantifier(&dummy, &dummy)) {
501 ParseError("nothing to repeat");
502 return;
503 }
504 pc_ = begin;
505 Advance();
506 }
507 [[fallthrough]];
508 case '}':
509 case ']':
510 if (IsUtf16()) {
511 ParseError("syntax error");
512 return;
513 }
514 [[fallthrough]];
515 default: {
516 // PatternCharacter
517 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
518 PrintF("PatternCharacter %c\n", c0_);
519 isAtom = true;
520 ParsePatternCharacter(isBackward);
521 Advance();
522 break;
523 }
524 }
525 }
526
ParseAlternative(bool isBackward)527 void RegExpParser::ParseAlternative(bool isBackward)
528 {
529 size_t start = buffer_.size_;
530 while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
531 if (isError_) {
532 return;
533 }
534 size_t atomBcStart = buffer_.GetSize();
535 int captureIndex = 0;
536 bool isAtom = false;
537 ParseAlternativeImpl(isBackward, isAtom, captureIndex);
538 if (isAtom && !isError_) {
539 ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
540 }
541 if (isBackward) {
542 size_t end = buffer_.GetSize();
543 size_t termSize = end - atomBcStart;
544 size_t moveSize = end - start;
545 buffer_.Expand(end + termSize);
546 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
547 if (memmove_s(buffer_.buf_ + start + termSize, moveSize, buffer_.buf_ + start, moveSize) != EOK) {
548 LOG(FATAL, COMMON) << "memmove_s failed";
549 UNREACHABLE();
550 }
551 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
552 if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
553 LOG(FATAL, COMMON) << "memcpy_s failed";
554 UNREACHABLE();
555 }
556 }
557 }
558 }
559
FindGroupName(const PandaString & name)560 int RegExpParser::FindGroupName(const PandaString &name)
561 {
562 size_t len;
563 size_t nameLen = name.size();
564 const char *p = reinterpret_cast<char *>(groupNames_.buf_);
565 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
566 const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
567 int captureIndex = 1;
568 while (p < bufEnd) {
569 len = strlen(p);
570 if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
571 return captureIndex;
572 }
573 p += len + 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
574 captureIndex++;
575 }
576 return -1;
577 }
578
579 template <typename OpCodeT>
InsertMatchAheadOpCode(bool isBackward)580 void RegExpParser::InsertMatchAheadOpCode(bool isBackward)
581 {
582 Advance();
583 uint32_t start = buffer_.size_;
584 ParseDisjunction(isBackward);
585 MatchOpCode matchOp;
586 matchOp.EmitOpCode(&buffer_, 0);
587 OpCodeT matchAheadOp;
588 uint32_t len = buffer_.size_ - start;
589 matchAheadOp.InsertOpCode(&buffer_, start, len);
590 }
591
HandleGroupName()592 bool RegExpParser::HandleGroupName()
593 {
594 PandaString name;
595 auto **pp = const_cast<const uint8_t **>(&pc_);
596 if (!ParseGroupSpecifier(pp, name)) {
597 ParseError("GroupName Syntax error.");
598 return false;
599 }
600 if (FindGroupName(name) > 0) {
601 ParseError("Duplicate GroupName error.");
602 return false;
603 }
604 groupNames_.EmitStr(name.c_str());
605 newGroupNames_.push_back(name);
606 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
607 PrintF("group name %s", name.c_str());
608
609 return true;
610 }
611
ParseAssertion(bool isBackward,bool & isAtom,bool & parseCapture)612 bool RegExpParser::ParseAssertion(bool isBackward, bool &isAtom, bool &parseCapture)
613 {
614 switch (c0_) {
615 // (?=Disjunction[?U, ?N])
616 case '=': {
617 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
618 PrintF("Assertion(?= Disjunction)\n");
619 InsertMatchAheadOpCode<MatchAheadOpCode>(isBackward);
620 break;
621 }
622 // (?!Disjunction[?U, ?N])
623 case '!': {
624 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
625 PrintF("Assertion(?! Disjunction)\n");
626 InsertMatchAheadOpCode<NegativeMatchAheadOpCode>(isBackward);
627 break;
628 }
629 case '<': {
630 Advance();
631 // (?<=Disjunction[?U, ?N])
632 if (c0_ == '=') {
633 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
634 PrintF("Assertion(?<= Disjunction)\n");
635 InsertMatchAheadOpCode<MatchAheadOpCode>(true);
636 return true;
637 // (?<!Disjunction[?U, ?N])
638 }
639 if (c0_ == '!') {
640 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
641 PrintF("Assertion(?<! Disjunction)\n");
642 InsertMatchAheadOpCode<NegativeMatchAheadOpCode>(true);
643 return true;
644 }
645
646 Prev();
647 if (!HandleGroupName()) {
648 return false;
649 }
650 Advance();
651 parseCapture = true;
652 break;
653 }
654 // (?:Disjunction[?U, ?N])
655 case ':':
656 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
657 PrintF("Atom(?<: Disjunction)\n");
658 isAtom = true;
659 Advance();
660 ParseDisjunction(isBackward);
661 break;
662 default:
663 Advance();
664 ParseError("? Syntax error.");
665 return false;
666 }
667
668 return true;
669 }
670
ParseAssertionCapture(int * captureIndex,bool isBackward)671 bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
672 {
673 bool isAtom = false;
674 do {
675 bool parseCapture = false;
676 if (c0_ == '?') {
677 Advance();
678 if (!ParseAssertion(isBackward, isAtom, parseCapture)) {
679 return false;
680 }
681 } else {
682 groupNames_.EmitChar(0);
683 parseCapture = true;
684 }
685 if (parseCapture) {
686 isAtom = true;
687 *captureIndex = captureCount_++;
688 SaveEndOpCode saveEndOp;
689 SaveStartOpCode saveStartOp;
690 if (isBackward) {
691 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
692 } else {
693 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
694 }
695 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
696 PrintF("capture start %d \n", *captureIndex);
697 ParseDisjunction(isBackward);
698 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
699 PrintF("capture end %d \n", *captureIndex);
700 if (isBackward) {
701 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
702 } else {
703 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
704 }
705 }
706 } while (c0_ != ')' && c0_ != KEY_EOF);
707 if (c0_ != ')') {
708 ParseError("capture syntax error");
709 return false;
710 }
711 return isAtom;
712 }
713
ParseDecimalDigits()714 int RegExpParser::ParseDecimalDigits()
715 {
716 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
717 PrintF("Parse DecimalDigits------\n");
718 uint32_t result = 0;
719 bool overflow = false;
720 while (true) {
721 if (c0_ < '0' || c0_ > '9') {
722 break;
723 }
724 if (!overflow) {
725 if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
726 overflow = true;
727 } else {
728 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
729 }
730 }
731 Advance();
732 }
733 if (overflow) {
734 return INT32_MAX;
735 }
736 return result;
737 }
738
ParserIntervalQuantifier(int * pmin,int * pmax)739 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
740 {
741 // Quantifier::
742 // QuantifierPrefix
743 // QuantifierPrefix?
744 // QuantifierPrefix::
745 // *
746 // +
747 // ?
748 // {DecimalDigits}
749 // {DecimalDigits,}
750 // {DecimalDigits,DecimalDigits}
751 Advance();
752 *pmin = ParseDecimalDigits();
753 *pmax = *pmin;
754 switch (c0_) {
755 case ',': {
756 Advance();
757 if (c0_ == '}') {
758 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
759 PrintF("QuantifierPrefix{DecimalDigits,}\n");
760 *pmax = INT32_MAX;
761 Advance();
762 } else {
763 *pmax = ParseDecimalDigits();
764 if (c0_ == '}') {
765 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
766 PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
767 Advance();
768 } else {
769 return false;
770 }
771 }
772 break;
773 }
774 case '}':
775 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
776 PrintF("QuantifierPrefix{DecimalDigits}\n");
777 Advance();
778 break;
779 default:
780 Advance();
781 return false;
782 }
783 return true;
784 }
785
ParseQuantifierPrefix(int & min,int & max,bool & isGreedy)786 bool RegExpParser::ParseQuantifierPrefix(int &min, int &max, bool &isGreedy)
787 {
788 switch (c0_) {
789 case '*':
790 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
791 PrintF("QuantifierPrefix %c\n", c0_);
792 min = 0;
793 max = INT32_MAX;
794 Advance();
795 break;
796 case '+':
797 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
798 PrintF("QuantifierPrefix %c\n", c0_);
799 min = 1;
800 max = INT32_MAX;
801 Advance();
802 break;
803 case '?':
804 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
805 PrintF("QuantifierPrefix %c\n", c0_);
806 Advance();
807 min = 0;
808 max = 1;
809 break;
810 case '{': {
811 uint8_t *start = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
812 if (!ParserIntervalQuantifier(&min, &max)) {
813 pc_ = start;
814 Advance(); // back to '{'
815 return false;
816 }
817 if (min > max) {
818 ParseError("Invalid repetition count");
819 return false;
820 }
821 break;
822 }
823 default:
824 break;
825 }
826 if (c0_ == '?') {
827 isGreedy = false;
828 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
829 PrintF("Quantifier::QuantifierPrefix?\n");
830 Advance();
831 } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
832 ParseError("nothing to repeat");
833 return false;
834 }
835 return true;
836 }
837
ParseQuantifier(size_t atomBcStart,int captureStart,int captureEnd)838 void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
839 {
840 int min = -1;
841 int max = -1;
842 bool isGreedy = true;
843 if (!ParseQuantifierPrefix(min, max, isGreedy)) {
844 return;
845 }
846 if (min != -1 && max != -1) {
847 stackCount_++;
848 PushOpCode pushOp;
849 pushOp.InsertOpCode(&buffer_, atomBcStart);
850 atomBcStart += pushOp.GetSize();
851
852 if (captureStart != 0) {
853 SaveResetOpCode saveResetOp;
854 saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
855 }
856
857 // zero advance check
858 if (max == INT32_MAX) {
859 stackCount_++;
860 PushCharOpCode pushCharOp;
861 pushCharOp.InsertOpCode(&buffer_, atomBcStart);
862 CheckCharOpCode checkCharOp;
863 // NOLINTNEXTLINE(readability-magic-numbers)
864 checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
865 }
866
867 if (isGreedy) {
868 LoopGreedyOpCode loopOp;
869 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
870 } else {
871 LoopOpCode loopOp;
872 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
873 }
874
875 if (min == 0) {
876 if (isGreedy) {
877 SplitNextOpCode splitNextOp;
878 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
879 } else {
880 SplitFirstOpCode splitFirstOp;
881 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
882 }
883 }
884
885 PopOpCode popOp;
886 popOp.EmitOpCode(&buffer_);
887 }
888 }
889
ParseGroupSpecifier(const uint8_t ** pp,PandaString & name)890 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, PandaString &name)
891 {
892 const uint8_t *p = *pp;
893 uint32_t c;
894 std::array<char, CACHE_SIZE> buffer {};
895 char *q = buffer.data();
896 while (true) {
897 if (p <= end_) {
898 c = *p;
899 } else {
900 c = KEY_EOF;
901 }
902 if (c == '\\') {
903 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
904 p++;
905 if (*p != 'u') {
906 return false;
907 }
908 if (!ParseUnicodeEscape(&c)) {
909 return false;
910 }
911 } else if (c == '>') {
912 break;
913 } else if (c > CACHE_SIZE && c != KEY_EOF) {
914 c = static_cast<uint32_t>(UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
915 } else if (c != KEY_EOF) {
916 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
917 p++;
918 } else {
919 return false;
920 }
921 if (q == buffer.data()) {
922 if (IsIdentFirst(c) == 0) {
923 return false;
924 }
925 } else {
926 if (!u_isIDPart(c)) {
927 return false;
928 }
929 }
930 if (q != nullptr) {
931 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
932 *q++ = c;
933 }
934 }
935 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
936 p++;
937 *pp = p;
938 name = buffer.data();
939 return true;
940 }
941
CalculateCaptureIndex(const uint8_t * p,int & captureIndex,const char * groupName,PandaString & name)942 bool RegExpParser::CalculateCaptureIndex(const uint8_t *p, int &captureIndex, const char *groupName, PandaString &name)
943 {
944 if (p[1] == '?') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
945 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
946 if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
947 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
948 p[CAPTURE_CONUT_ADVANCE] != '=') {
949 hasNamedCaptures_ = 1;
950 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
951 p += CAPTURE_CONUT_ADVANCE;
952 if (groupName != nullptr && ParseGroupSpecifier(&p, name) && strcmp(name.c_str(), groupName) == 0) {
953 return true;
954 }
955 captureIndex++;
956 }
957 } else {
958 captureIndex++;
959 }
960
961 return false;
962 }
963
ShiftPointerToClosingBracket(const uint8_t * p,const uint8_t * end)964 static inline void ShiftPointerToClosingBracket(const uint8_t *p, const uint8_t *end)
965 {
966 while (p < end && *p != ']') {
967 if (*p == '\\') {
968 p++; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
969 }
970 p++; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
971 }
972 }
973
ParseCaptureCountImpl(const uint8_t * p,int & captureIndex,const char * groupName,PandaString & name)974 bool RegExpParser::ParseCaptureCountImpl(const uint8_t *p, int &captureIndex, const char *groupName, PandaString &name)
975 {
976 switch (*p) {
977 case '(': {
978 if (CalculateCaptureIndex(p, captureIndex, groupName, name)) {
979 return true;
980 }
981 break;
982 }
983 case '\\':
984 p++; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
985 break;
986 case '[': {
987 ShiftPointerToClosingBracket(p, end_);
988 break;
989 }
990 default:
991 break;
992 }
993
994 return false;
995 }
996
ParseCaptureCount(const char * groupName)997 int RegExpParser::ParseCaptureCount(const char *groupName)
998 {
999 const uint8_t *p = nullptr;
1000 int captureIndex = 1;
1001 PandaString name;
1002 hasNamedCaptures_ = 0;
1003 for (p = base_; p < end_; p++) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1004 if (ParseCaptureCountImpl(p, captureIndex, groupName, name)) {
1005 return captureIndex;
1006 }
1007 }
1008 return captureIndex;
1009 }
1010
ParseLookBehind(DynChunk & buffer,PrevOpCode & prevOp,bool isBackward)1011 void RegExpParser::ParseLookBehind(DynChunk &buffer, PrevOpCode &prevOp, bool isBackward)
1012 {
1013 if (isBackward) {
1014 prevOp.EmitOpCode(&buffer, 0);
1015 }
1016 Advance();
1017 }
1018
InsertRangeOpCode(DynChunk & buffer,RangeSet & rangeSet,PrevOpCode & prevOp,bool isBackward)1019 void RegExpParser::InsertRangeOpCode(DynChunk &buffer, RangeSet &rangeSet, PrevOpCode &prevOp, bool isBackward)
1020 {
1021 RangeOpCode rangeOp;
1022 if (isBackward) {
1023 prevOp.EmitOpCode(&buffer, 0);
1024 }
1025 rangeOp.InsertOpCode(&buffer, rangeSet);
1026 ParseLookBehind(buffer, prevOp, isBackward);
1027 }
1028
InsertRange32OpCode(DynChunk & buffer,RangeSet & rangeSet,PrevOpCode & prevOp,bool isBackward)1029 void RegExpParser::InsertRange32OpCode(DynChunk &buffer, RangeSet &rangeSet, PrevOpCode &prevOp, bool isBackward)
1030 {
1031 RangeSet atomRange(rangeSet);
1032 atomRange.Invert(IsUtf16());
1033 Range32OpCode rangeOp;
1034 if (isBackward) {
1035 prevOp.EmitOpCode(&buffer, 0);
1036 }
1037 rangeOp.InsertOpCode(&buffer, atomRange);
1038 ParseLookBehind(buffer, prevOp, isBackward);
1039 }
1040
ParseGroupName()1041 int RegExpParser::ParseGroupName()
1042 {
1043 Advance();
1044 if (c0_ != '<') {
1045 if (!IsUtf16() || HasNamedCaptures()) {
1046 ParseError("expecting group name.");
1047 return -1;
1048 }
1049 }
1050 Advance();
1051 Prev();
1052 PandaString name;
1053 auto **pp = const_cast<const uint8_t **>(&pc_);
1054 if (!ParseGroupSpecifier(pp, name)) {
1055 ParseError("GroupName Syntax error.");
1056 return -1;
1057 }
1058 int postion = FindGroupName(name);
1059 if (postion < 0) {
1060 postion = ParseCaptureCount(name.c_str());
1061 if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
1062 ParseError("group name not defined");
1063 return -1;
1064 }
1065 }
1066
1067 return postion;
1068 }
1069
EmitRefOpCode(DynChunk & buffer,uint32_t para,bool isBackward)1070 static void EmitRefOpCode(DynChunk &buffer, uint32_t para, bool isBackward)
1071 {
1072 if (isBackward) {
1073 BackwardBackReferenceOpCode backReferenceOp;
1074 backReferenceOp.EmitOpCode(&buffer, para);
1075 } else {
1076 BackReferenceOpCode backReferenceOp;
1077 backReferenceOp.EmitOpCode(&buffer, para);
1078 }
1079 }
1080
1081 // CC-OFFNXT(G.FUN.01, huge_method) big switch case
1082 // NOLINTNEXTLINE(readability-function-size)
ParseAtomEscape(bool isBackward)1083 int RegExpParser::ParseAtomEscape(bool isBackward)
1084 {
1085 // AtomEscape[U, N]::
1086 // DecimalEscape
1087 // CharacterClassEscape[?U]
1088 // CharacterEscape[?U]
1089 // [+N]kGroupName[?U]
1090 int result = -1;
1091 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1092 PrintF("Parse AtomEscape------\n");
1093 PrevOpCode prevOp;
1094 switch (c0_) {
1095 case KEY_EOF:
1096 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1097 ParseError("unexpected end");
1098 break;
1099 // DecimalEscape
1100 case '1':
1101 case '2':
1102 case '3':
1103 case '4':
1104 case '5':
1105 case '6':
1106 case '7':
1107 case '8':
1108 case '9': {
1109 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1110 PrintF("NonZeroDigit %c\n", c0_);
1111 int capture = ParseDecimalDigits();
1112 if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
1113 ParseError("invalid backreference count");
1114 break;
1115 }
1116 EmitRefOpCode(buffer_, capture, isBackward);
1117 break;
1118 }
1119 // CharacterClassEscape
1120 case 'd': {
1121 // [0-9]
1122 InsertRangeOpCode(buffer_, g_gRangeD, prevOp, isBackward);
1123 break;
1124 }
1125 case 'D': {
1126 // [^0-9]
1127 InsertRange32OpCode(buffer_, g_gRangeD, prevOp, isBackward);
1128 break;
1129 }
1130 case 's': {
1131 // [\f\n\r\t\v]
1132 InsertRangeOpCode(buffer_, g_gRangeS, prevOp, isBackward);
1133 break;
1134 }
1135 case 'S': {
1136 InsertRange32OpCode(buffer_, g_gRangeS, prevOp, isBackward);
1137 break;
1138 }
1139 case 'w': {
1140 // [A-Za-z0-9]
1141 InsertRangeOpCode(buffer_, g_gRangeW, prevOp, isBackward);
1142 break;
1143 }
1144 case 'W': {
1145 // [^A-Za-z0-9]
1146 InsertRange32OpCode(buffer_, g_gRangeW, prevOp, isBackward);
1147 break;
1148 }
1149 // P{UnicodePropertyValueExpression}
1150 // p{UnicodePropertyValueExpression}
1151 case 'P':
1152 case 'p':
1153 // [+N]kGroupName[?U]
1154 case 'k': {
1155 int postion = ParseGroupName();
1156 if (postion < 0) {
1157 break;
1158 }
1159 EmitRefOpCode(buffer_, postion, isBackward);
1160 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1161 Advance();
1162 break;
1163 }
1164 default:
1165 result = ParseCharacterEscape();
1166 break;
1167 }
1168 return result;
1169 }
1170
RecountCaptures()1171 int RegExpParser::RecountCaptures()
1172 {
1173 if (totalCaptureCount_ < 0) {
1174 const char *name = reinterpret_cast<const char *>(groupNames_.buf_);
1175 totalCaptureCount_ = ParseCaptureCount(name);
1176 }
1177 return totalCaptureCount_;
1178 }
HasNamedCaptures()1179 bool RegExpParser::HasNamedCaptures()
1180 {
1181 if (hasNamedCaptures_ < 0) {
1182 RecountCaptures();
1183 }
1184 return false;
1185 }
1186
1187 // CC-OFFNXT(G.FUN.01, huge_cyclomatic_complexity, huge_method) big switch case
ParseCharacterEscape()1188 int RegExpParser::ParseCharacterEscape()
1189 {
1190 // CharacterEscape[U]::
1191 // ControlEscape
1192 // c ControlLetter
1193 // 0 [lookahead ∉ DecimalDigit]
1194 // HexEscapeSequence
1195 // RegExpUnicodeEscapeSequence[?U]
1196 // IdentityEscape[?U]
1197 uint32_t result = 0;
1198 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1199 switch (c0_) {
1200 // ControlEscape
1201 case 'f':
1202 result = '\f';
1203 PrintControlEscapeAndAdvance();
1204 break;
1205 case 'n':
1206 result = '\n';
1207 PrintControlEscapeAndAdvance();
1208 break;
1209 case 'r':
1210 result = '\r';
1211 PrintControlEscapeAndAdvance();
1212 break;
1213 case 't':
1214 result = '\t';
1215 PrintControlEscapeAndAdvance();
1216 break;
1217 case 'v':
1218 result = '\v';
1219 PrintControlEscapeAndAdvance();
1220 break;
1221 // c ControlLetter
1222 case 'c': {
1223 ParseControlLetter(result);
1224 break;
1225 }
1226 case '0': {
1227 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1228 PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
1229 if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) { // NOLINT(readability-magic-numbers)
1230 Advance();
1231 result = 0;
1232 break;
1233 }
1234 [[fallthrough]];
1235 }
1236 case '1':
1237 case '2':
1238 case '3':
1239 case '4':
1240 case '5':
1241 case '6':
1242 case '7': {
1243 if (IsUtf16()) {
1244 // With /u, decimal escape is not interpreted as octal character code.
1245 ParseError("Invalid class escape");
1246 return 0;
1247 }
1248 result = ParseOctalLiteral();
1249 break;
1250 }
1251 // ParseHexEscapeSequence
1252 // ParseRegExpUnicodeEscapeSequence
1253 case 'x': {
1254 Advance();
1255 if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
1256 return result;
1257 }
1258 if (IsUtf16()) {
1259 ParseError("Invalid class escape");
1260 return -1;
1261 }
1262 result = 'x';
1263 break;
1264 }
1265 case 'u': {
1266 Advance();
1267 if (ParseUnicodeEscape(&result)) {
1268 return result;
1269 }
1270 if (IsUtf16()) {
1271 // With /u, invalid escapes are not treated as identity escapes.
1272 ParseError("Invalid unicode escape");
1273 return 0;
1274 }
1275 // If \u is not followed by a two-digit hexadecimal, treat it
1276 // as an identity escape.
1277 result = 'u';
1278 break;
1279 }
1280 // IdentityEscape[?U]
1281 case '$':
1282 case '(':
1283 case ')':
1284 case '*':
1285 case '+':
1286 case '.':
1287 case '/':
1288 case '?':
1289 case '[':
1290 case '\\':
1291 case ']':
1292 case '^':
1293 case '{':
1294 case '|':
1295 case '}':
1296 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1297 PrintF("IdentityEscape %c\n", c0_);
1298 result = c0_;
1299 Advance();
1300 break;
1301 default: {
1302 ParseCharacterEscapeDefault(result);
1303 break;
1304 }
1305 }
1306 return static_cast<int>(result);
1307 }
1308
ParseCharacterEscapeDefault(uint32_t & result)1309 void RegExpParser::ParseCharacterEscapeDefault(uint32_t &result)
1310 {
1311 if (IsUtf16()) {
1312 ParseError("Invalid unicode escape");
1313 result = 0;
1314 return;
1315 }
1316 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1317 PrintF("SourceCharacter %c\n", c0_);
1318 result = c0_;
1319 if (result < CHAR_MAXS) {
1320 Advance();
1321 }
1322 }
1323
ParseControlLetter(uint32_t & result)1324 void RegExpParser::ParseControlLetter(uint32_t &result)
1325 {
1326 Advance();
1327 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1328 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1329 PrintF("ControlLetter %c\n", c0_);
1330 result = static_cast<uint32_t>(c0_) & 0x1f; // NOLINT(readability-magic-numbers, hicpp-signed-bitwise)
1331 Advance();
1332 } else {
1333 if (!IsUtf16()) {
1334 pc_--; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1335 result = '\\';
1336 } else {
1337 ParseError("Invalid control letter");
1338 result = -1;
1339 }
1340 }
1341 }
1342
PrintControlEscapeAndAdvance()1343 void RegExpParser::PrintControlEscapeAndAdvance()
1344 {
1345 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1346 PrintF("ControlEscape %c\n", c0_);
1347 Advance();
1348 }
1349
ParseClassRangesImpl(RangeSet * result)1350 bool RegExpParser::ParseClassRangesImpl(RangeSet *result)
1351 {
1352 RangeSet s1;
1353 uint32_t c1 = ParseClassAtom(&s1);
1354 if (c1 == UINT32_MAX) {
1355 ParseError("invalid class range");
1356 return false;
1357 }
1358
1359 int nextC0 = *pc_;
1360 if (c0_ == '-' && nextC0 != ']') {
1361 if (c1 == CLASS_RANGE_BASE) {
1362 if (IsUtf16()) {
1363 ParseError("invalid class range");
1364 return false;
1365 }
1366 result->Insert(s1);
1367 return true;
1368 }
1369 Advance();
1370 RangeSet s2;
1371 uint32_t c2 = ParseClassAtom(&s2);
1372 if (c2 == UINT32_MAX) {
1373 ParseError("invalid class range");
1374 return false;
1375 }
1376 if (c2 == CLASS_RANGE_BASE) {
1377 if (IsUtf16()) {
1378 ParseError("invalid class range");
1379 return false;
1380 }
1381 result->Insert(s2);
1382 return true;
1383 }
1384 if (c1 < INT8_MAX) {
1385 if (c1 > c2) {
1386 ParseError("invalid class range");
1387 return false;
1388 }
1389 }
1390 if (IsIgnoreCase()) {
1391 c1 = static_cast<uint32_t>(Canonicalize(c1, IsUtf16()));
1392 c2 = static_cast<uint32_t>(Canonicalize(c2, IsUtf16()));
1393 }
1394
1395 result->Insert(c1, c2);
1396 } else {
1397 result->Insert(s1);
1398 }
1399
1400 return true;
1401 }
1402
ParseClassRanges(RangeSet * result)1403 bool RegExpParser::ParseClassRanges(RangeSet *result)
1404 {
1405 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1406 PrintF("Parse ClassRanges------\n");
1407 while (c0_ != ']') {
1408 if (!ParseClassRangesImpl(result)) {
1409 return false;
1410 }
1411 }
1412 Advance();
1413 return true;
1414 }
1415
ParseClassAtom(RangeSet * atom)1416 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1417 {
1418 uint32_t ret = UINT32_MAX;
1419 switch (c0_) {
1420 case '\\': {
1421 Advance();
1422 ret = static_cast<uint32_t>(ParseClassEscape(atom));
1423 break;
1424 }
1425 case KEY_EOF:
1426 break;
1427 case 0: {
1428 if (pc_ >= end_) {
1429 return UINT32_MAX;
1430 }
1431 [[fallthrough]];
1432 }
1433 default: {
1434 uint32_t value = c0_;
1435 size_t u16Size;
1436 if (c0_ > INT8_MAX) {
1437 pc_ -= 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1438 auto u16Result = utf::ConvertUtf8ToUtf16Pair(pc_, true);
1439 value = u16Result.first;
1440 u16Size = u16Result.second;
1441 Advance(u16Size + 1);
1442 } else {
1443 Advance();
1444 }
1445 if (IsIgnoreCase()) {
1446 value = static_cast<uint32_t>(Canonicalize(value, IsUtf16()));
1447 }
1448 atom->Insert(RangeSet(value));
1449 ret = value;
1450 break;
1451 }
1452 }
1453 return ret;
1454 }
1455
InsertRangeBase(RangeSet * atom,RangeSet & rangeSet,bool invert)1456 void RegExpParser::InsertRangeBase(RangeSet *atom, RangeSet &rangeSet, bool invert)
1457 {
1458 atom->Insert(rangeSet);
1459 if (invert) {
1460 atom->Invert(IsUtf16());
1461 }
1462 }
1463
ParseClassEscape(RangeSet * atom)1464 int RegExpParser::ParseClassEscape(RangeSet *atom)
1465 {
1466 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1467 PrintF("Parse ClassEscape------\n");
1468 int result = -1;
1469 switch (c0_) {
1470 case 'b':
1471 Advance();
1472 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1473 PrintF("ClassEscape %c", 'b');
1474 result = '\b';
1475 atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1476 break;
1477 case '-':
1478 Advance();
1479 result = '-';
1480 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1481 PrintF("ClassEscape %c", '-');
1482 atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1483 break;
1484 // CharacterClassEscape
1485 case 'd':
1486 case 'D':
1487 result = CLASS_RANGE_BASE;
1488 InsertRangeBase(atom, g_gRangeD, c0_ == 'D');
1489 Advance();
1490 break;
1491 case 's':
1492 case 'S':
1493 result = CLASS_RANGE_BASE;
1494 InsertRangeBase(atom, g_gRangeS, c0_ == 'S');
1495 Advance();
1496 break;
1497 case 'w':
1498 case 'W':
1499 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1500 PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1501 result = CLASS_RANGE_BASE;
1502 InsertRangeBase(atom, g_gRangeW, c0_ == 'W');
1503 Advance();
1504 break;
1505 // P{UnicodePropertyValueExpression}
1506 // p{UnicodePropertyValueExpression}
1507 case 'P':
1508 case 'p':
1509 ParseUnicodePropertyValueCharacters(result);
1510 break;
1511 default:
1512 result = ParseCharacterEscape();
1513 int value = result;
1514 if (IsIgnoreCase()) {
1515 value = Canonicalize(value, IsUtf16());
1516 }
1517 atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1518 break;
1519 }
1520 return result;
1521 }
1522
ParseUnicodePropertyValueCharacters(int & result)1523 void RegExpParser::ParseUnicodePropertyValueCharacters(int &result)
1524 {
1525 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1526 PrintF("Warning: \\p is not supported in ECMA 2015!");
1527 Advance();
1528 if (c0_ == '{') {
1529 Advance();
1530 if (c0_ == '}') {
1531 return; // p{}, invalid
1532 }
1533 bool isValue = false;
1534 ParseUnicodePropertyValueCharactersImpl(&isValue);
1535 if (!isValue && c0_ == '=') {
1536 // UnicodePropertyName = UnicodePropertyValue
1537 Advance();
1538 if (c0_ == '}') {
1539 return; // p{xxx=}, invalid
1540 }
1541 ParseUnicodePropertyValueCharactersImpl(&isValue);
1542 }
1543 if (c0_ != '}') {
1544 return; // p{xxx, invalid
1545 }
1546 // should do atom->Invert() here after ECMA 9.0
1547 Advance();
1548 result = CLASS_RANGE_BASE;
1549 }
1550 }
1551
ParseUnicodePropertyValueCharactersImpl(bool * isValue)1552 void RegExpParser::ParseUnicodePropertyValueCharactersImpl(bool *isValue)
1553 {
1554 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1555 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1556 PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1557 } else if (c0_ == '_') {
1558 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1559 PrintF("UnicodePropertyCharacter:: _ \n");
1560 } else if (c0_ >= '0' && c0_ <= '9') {
1561 *isValue = true;
1562 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1563 PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1564 } else {
1565 return;
1566 }
1567 Advance();
1568 ParseUnicodePropertyValueCharactersImpl(isValue);
1569 }
1570
1571 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1572 void RegExpParser::PrintF(const char *fmt, ...)
1573 {
1574 (void)fmt;
1575 }
1576
ParseError(const char * errorMessage)1577 void RegExpParser::ParseError(const char *errorMessage)
1578 {
1579 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1580 PrintF("error: ");
1581 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1582 PrintF(errorMessage);
1583 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1584 PrintF("\n");
1585 SetIsError();
1586 size_t length = strlen(errorMessage) + 1;
1587 if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1588 LOG(FATAL, COMMON) << "memcpy_s failed";
1589 UNREACHABLE();
1590 }
1591 }
1592
IsIdentFirst(uint32_t c)1593 int RegExpParser::IsIdentFirst(uint32_t c)
1594 {
1595 if (c < CACHE_SIZE) {
1596 // NOLINTNEXTLINE(hicpp-signed-bitwise
1597 return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1; // 5: Shift five bits 31: and operation binary of 31
1598 }
1599 return static_cast<int>(u_isIDStart(c));
1600 }
1601 } // namespace ark