1 /**
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "runtime/regexp/ecmascript/regexp_parser.h"
17 #include "runtime/regexp/ecmascript/regexp_opcode.h"
18 #include "runtime/include/coretypes/string-inl.h"
19
20 #include "libpandabase/utils/utils.h"
21 #include "libpandabase/utils/utf.h"
22
23 #include "securec.h"
24 #include "unicode/uchar.h"
25 #include "unicode/uniset.h"
26
27 namespace {
// Sentinel returned by FromUtf8()/UnicodeFromUtf8() when the bytes do not form an accepted sequence.
constexpr int INVALID_UNICODE_FROM_UTF8 = -1;

// Lead-byte boundaries consulted by UnicodeFromUtf8():
//   [0]        0x80         bytes below this value are returned unchanged (single-byte code points)
//   [1]..[2]   0xc0..0xdf   lead byte followed by 1 continuation byte
//   [3]..[4]   0xe0..0xef   lead byte followed by 2 continuation bytes
//   [5]..[6]   0xf0..0xf7   lead byte followed by 3 continuation bytes
//   [7]..[8]   0xf8..0xfb   lead byte followed by 4 continuation bytes
//   [9], [10]  0xfc, 0xfd   lead byte followed by 5 continuation bytes
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
constexpr int UICODE_FROM_UTF8[] = {
    0x80, 0xc0, 0xdf, 0xe0, 0xef, 0xf0, 0xf7, 0xf8, 0xfb, 0xfc, 0xfd,
};

// Smallest value that a sequence with (index + 1) continuation bytes may decode to.
// FromUtf8() rejects anything below it, i.e. overlong encodings.
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
constexpr int UTF8_MIN_CODE[] = {
    0x80, 0x800, 0x10000, 0x00200000, 0x04000000,
};

// Mask applied by FromUtf8() to the lead byte of a sequence with (index + 1) continuation bytes,
// keeping only the payload bits of that byte.
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
constexpr char UTF8_FIRST_CODE[] = {
    0x1f, 0xf, 0x7, 0x3, 0x1,
};
44
FromUtf8(int c,int l,const uint8_t * p,const uint8_t ** pp)45 int FromUtf8(int c, int l, const uint8_t *p, const uint8_t **pp)
46 {
47 uint32_t b;
48 // NOLINTNEXTLINE(hicpp-signed-bitwise)
49 c &= UTF8_FIRST_CODE[l - 1];
50 for (int i = 0; i < l; i++) {
51 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
52 b = *p++;
53 if (b < ark::utf::UTF8_2B_SECOND || b >= ark::utf::UTF8_2B_FIRST) {
54 return INVALID_UNICODE_FROM_UTF8;
55 }
56 // NOLINTNEXTLINE(hicpp-signed-bitwise)
57 c = (c << 6) | (b & ark::utf::UTF8_2B_THIRD); // 6: Maximum Unicode range
58 }
59 if (c < UTF8_MIN_CODE[l - 1]) {
60 return INVALID_UNICODE_FROM_UTF8;
61 }
62 *pp = p;
63 return c;
64 }
65
UnicodeFromUtf8(const uint8_t * p,int maxLen,const uint8_t ** pp)66 int UnicodeFromUtf8(const uint8_t *p, int maxLen, const uint8_t **pp)
67 {
68 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
69 int c = *p++;
70 if (c < UICODE_FROM_UTF8[0]) {
71 *pp = p;
72 return c;
73 }
74 int l = 0;
75 if (c >= UICODE_FROM_UTF8[1U] && c <= UICODE_FROM_UTF8[2U]) { // 1 - 2: 0000 0080 - 0000 07FF
76 l = 1; // 1: 0000 0080 - 0000 07FF Unicode
77 } else if (c >= UICODE_FROM_UTF8[3U] && c <= UICODE_FROM_UTF8[4U]) { // 3 - 4: 0000 0800 - 0000 FFFF
78 l = 2; // 2: 0000 0800 - 0000 FFFF Unicode
79 } else if (c >= UICODE_FROM_UTF8[5U] && c <= UICODE_FROM_UTF8[6U]) { // 5 - 6: 0001 0000 - 0010 FFFF
80 l = 3; // 3: 0001 0000 - 0010 FFFF Unicode
81 } else if (c >= UICODE_FROM_UTF8[7U] && c <= UICODE_FROM_UTF8[8U]) { // 7 - 8: 0020 0000 - 03FF FFFF
82 l = 4; // 4: 0020 0000 - 03FF FFFF Unicode
83 // NOLINTNEXTLINE(readability-magic-numbers)
84 } else if (c == UICODE_FROM_UTF8[9U] || c == UICODE_FROM_UTF8[10U]) { // 9 - 10: 0400 0000 - 7FFF FFFF
85 l = 5; // 5: 0400 0000 - 7FFF FFFF Unicode
86 } else {
87 return INVALID_UNICODE_FROM_UTF8;
88 }
89 /* check that we have enough characters */
90 if (l > (maxLen - 1)) {
91 return INVALID_UNICODE_FROM_UTF8;
92 }
93 return FromUtf8(c, l, p, pp);
94 }
95 } // namespace
96
97 namespace ark {
// Capacity of the stack buffer used for group names in ParseGroupSpecifier(); the same value is
// used there as the boundary between single-byte and multi-byte characters.
static constexpr uint32_t CACHE_SIZE = 128;
static constexpr uint32_t CHAR_MAXS = 128;
// Bitmap over the code points 0..127 (32 per word); a set bit marks '$', 'A'-'Z', '_' or 'a'-'z'.
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
static constexpr uint32_t ID_START_TABLE_ASCII[4] = {
    /* $ A-Z _ a-z */
    0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE};
// Decimal digits '0'-'9' (presumably backs the \d class escape - confirm against ParseAtomEscape).
static RangeSet g_gRangeD(0x30, 0x39);  // NOLINT(fuchsia-statically-constructed-objects, readability-magic-numbers)
// White space and line terminator code points (presumably backs the \s class escape - confirm).
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_gRangeS({
    std::pair<uint32_t, uint32_t>(0x0009, 0x000D),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0020, 0x0020),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x00A0, 0x00A0),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x1680, 0x1680),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x2000, 0x200A),  // NOLINT(readability-magic-numbers)
    /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
    /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
    std::pair<uint32_t, uint32_t>(0x2028, 0x2029),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x202F, 0x202F),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x205F, 0x205F),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x3000, 0x3000),  // NOLINT(readability-magic-numbers)
    /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
    std::pair<uint32_t, uint32_t>(0xFEFF, 0xFEFF),  // NOLINT(readability-magic-numbers)
});

// Word characters: '0'-'9', 'A'-'Z', '_', 'a'-'z' (presumably backs the \w class escape - confirm).
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_gRangeW({
    std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x005F, 0x005F),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
});

// ASCII characters listed as identifier starts: '$', 'A'-'Z', 'a'-'z'.
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_gRegexpIdentifyStart({
    std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
});

// ASCII characters listed as identifier continuations: '$', '0'-'9', 'A'-'Z', 'a'-'z'.
// NOLINTNEXTLINE(fuchsia-statically-constructed-objects)
static RangeSet g_gRegexpIdentifyContinue({
    std::pair<uint32_t, uint32_t>(0x0024, 0x0024),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0030, 0x0039),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0041, 0x005A),  // NOLINT(readability-magic-numbers)
    std::pair<uint32_t, uint32_t>(0x0061, 0x007A),  // NOLINT(readability-magic-numbers)
});
144
Parse()145 void RegExpParser::Parse()
146 {
147 // dynbuffer head init [size,capture_count,statck_count,flags]
148 buffer_.EmitU32(0);
149 buffer_.EmitU32(0);
150 buffer_.EmitU32(0);
151 buffer_.EmitU32(0);
152 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
153 PrintF("Parse Pattern------\n");
154 // Pattern[U, N]::
155 // Disjunction[?U, ?N]
156 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
157 Advance();
158 SaveStartOpCode saveStartOp;
159 int captureIndex = captureCount_++;
160 saveStartOp.EmitOpCode(&buffer_, captureIndex);
161 ParseDisjunction(false);
162 if (c0_ != KEY_EOF) {
163 ParseError("extraneous characters at the end");
164 return;
165 }
166 SaveEndOpCode saveEndOp;
167 saveEndOp.EmitOpCode(&buffer_, captureIndex);
168 MatchEndOpCode matchEndOp;
169 matchEndOp.EmitOpCode(&buffer_, 0);
170 // dynbuffer head assignments
171 buffer_.PutU32(0, buffer_.size_);
172 buffer_.PutU32(NUM_CAPTURE__OFFSET, captureCount_);
173 buffer_.PutU32(NUM_STACK_OFFSET, stackCount_);
174 buffer_.PutU32(FLAGS_OFFSET, flags_);
175 }
176
ParseDisjunction(bool isBackward)177 void RegExpParser::ParseDisjunction(bool isBackward)
178 {
179 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
180 PrintF("Parse Disjunction------\n");
181 size_t start = buffer_.size_;
182 ParseAlternative(isBackward);
183 if (isError_) {
184 return;
185 }
186 do {
187 if (c0_ == '|') {
188 SplitNextOpCode splitOp;
189 uint32_t len = buffer_.size_ - start;
190 GotoOpCode gotoOp;
191 splitOp.InsertOpCode(&buffer_, start, len + gotoOp.GetSize());
192 uint32_t pos = gotoOp.EmitOpCode(&buffer_, 0) - gotoOp.GetSize();
193 Advance();
194 ParseAlternative(isBackward);
195 gotoOp.UpdateOpPara(&buffer_, pos, buffer_.size_ - pos - gotoOp.GetSize());
196 }
197 } while (c0_ != KEY_EOF && c0_ != ')');
198 }
199
ParseOctalLiteral()200 uint32_t RegExpParser::ParseOctalLiteral()
201 {
202 // For compatibility with some other browsers (not all), we parse
203 // up to three octal digits with a value below 256.
204 // ES#prod-annexB-LegacyOctalEscapeSequence
205 uint32_t value = c0_ - '0';
206 Advance();
207 if (c0_ >= '0' && c0_ <= '7') {
208 value = value * OCTAL_VALUE + c0_ - '0';
209 Advance();
210 if (value < OCTAL_VALUE_RANGE && c0_ >= '0' && c0_ <= '7') {
211 value = value * OCTAL_VALUE + c0_ - '0';
212 Advance();
213 }
214 }
215 return value;
216 }
217
ParseUnlimitedLengthHexNumber(uint32_t maxValue,uint32_t * value)218 bool RegExpParser::ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value)
219 {
220 uint32_t x = 0;
221 int d = static_cast<int>(HexValue(c0_));
222 if (d < 0) {
223 return false;
224 }
225 while (d >= 0) {
226 if (UNLIKELY(x > (std::numeric_limits<uint32_t>::max() - static_cast<uint32_t>(d)) / HEX_VALUE)) {
227 LOG(FATAL, COMMON) << "value overflow";
228 return false;
229 }
230 x = x * HEX_VALUE + static_cast<uint32_t>(d);
231 if (x > maxValue) {
232 return false;
233 }
234 Advance();
235 d = static_cast<int>(HexValue(c0_));
236 }
237 *value = x;
238 return true;
239 }
240
241 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
ParseUnicodeEscape(uint32_t * value)242 bool RegExpParser::ParseUnicodeEscape(uint32_t *value)
243 {
244 // Accept both \uxxxx and \u{xxxxxx} (if allowed).
245 // In the latter case, the number of hex digits between { } is arbitrary.
246 // \ and u have already been read.
247 if (c0_ == '{' && IsUtf16()) {
248 uint8_t *start = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
249 Advance();
250 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) { // NOLINT(readability-magic-numbers)
251 if (c0_ == '}') {
252 Advance();
253 return true;
254 }
255 }
256 pc_ = start;
257 Advance();
258 return false;
259 }
260 // \u but no {, or \u{...} escapes not allowed.
261 bool result = ParseHexEscape(UNICODE_HEX_VALUE, value);
262 if (result && IsUtf16() && U16_IS_LEAD(*value) && c0_ == '\\') {
263 // Attempt to read trail surrogate.
264 uint8_t *start = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
265 if (*pc_ == 'u') {
266 Advance(UNICODE_HEX_ADVANCE);
267 uint32_t trail;
268 if (ParseHexEscape(UNICODE_HEX_VALUE, &trail) && U16_IS_TRAIL(trail)) {
269 *value = U16_GET_SUPPLEMENTARY((*value), (trail)); // NOLINT(hicpp-signed-bitwise)
270 return true;
271 }
272 }
273 pc_ = start;
274 Advance();
275 }
276 return result;
277 }
278
ParseHexEscape(int length,uint32_t * value)279 bool RegExpParser::ParseHexEscape(int length, uint32_t *value)
280 {
281 uint8_t *start = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
282 uint32_t val = 0;
283 for (int i = 0; i < length; ++i) {
284 uint32_t c = c0_;
285 int d = static_cast<int>(HexValue(c));
286 if (d < 0) {
287 pc_ = start;
288 Advance();
289 return false;
290 }
291 val = val * HEX_VALUE + static_cast<uint32_t>(d);
292 Advance();
293 }
294 *value = val;
295 return true;
296 }
297
ParseAlternativeEscape(bool isBackward,bool & isAtom)298 void RegExpParser::ParseAlternativeEscape(bool isBackward, bool &isAtom)
299 {
300 switch (c0_) {
301 case 'b': {
302 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
303 PrintF("Assertion %c \n", c0_);
304 WordBoundaryOpCode wordBoundaryOp;
305 wordBoundaryOp.EmitOpCode(&buffer_, 0);
306 Advance();
307 break;
308 }
309 case 'B': {
310 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
311 PrintF("Assertion %c \n", c0_);
312 NotWordBoundaryOpCode notWordBoundaryOp;
313 notWordBoundaryOp.EmitOpCode(&buffer_, 0);
314 Advance();
315 break;
316 }
317 default: {
318 isAtom = true;
319 int atomValue = ParseAtomEscape(isBackward);
320 if (atomValue != -1) {
321 ParseAlternativeEscapeDefault(atomValue);
322 }
323 break;
324 }
325 }
326 }
327
ParseAlternativeEscapeDefault(int atomValue)328 void RegExpParser::ParseAlternativeEscapeDefault(int atomValue)
329 {
330 if (IsIgnoreCase()) {
331 if (!IsUtf16()) {
332 atomValue = Canonicalize(atomValue, false);
333 } else {
334 icu::UnicodeSet set(atomValue, atomValue);
335 set.closeOver(USET_CASE_INSENSITIVE);
336 set.removeAllStrings();
337 int32_t size = set.size();
338 RangeOpCode rangeOp;
339 RangeSet rangeResult;
340 for (int32_t idx = 0; idx < size; idx++) {
341 int32_t uc = set.charAt(idx);
342 RangeSet curRange(uc);
343 rangeResult.Insert(curRange);
344 }
345 rangeOp.InsertOpCode(&buffer_, rangeResult);
346 return;
347 }
348 }
349 if (atomValue <= UINT16_MAX) {
350 CharOpCode charOp;
351 charOp.EmitOpCode(&buffer_, atomValue);
352 } else {
353 Char32OpCode charOp;
354 charOp.EmitOpCode(&buffer_, atomValue);
355 }
356 }
357
ParsePatternCharacter(bool isBackward)358 void RegExpParser::ParsePatternCharacter(bool isBackward)
359 {
360 PrevOpCode prevOp;
361 if (isBackward) {
362 prevOp.EmitOpCode(&buffer_, 0);
363 }
364 uint32_t matchedChar = c0_;
365 if (c0_ > (INT8_MAX + 1)) {
366 Prev();
367 int i = 0;
368 UChar32 c;
369 int32_t length = end_ - pc_ + 1;
370 // NOLINTNEXTLINE(hicpp-signed-bitwise)
371 U8_NEXT(pc_, i, length, c); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
372 matchedChar = static_cast<uint32_t>(c);
373 pc_ += i; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
374 }
375 if (IsIgnoreCase()) {
376 matchedChar = static_cast<uint32_t>(Canonicalize(static_cast<int>(matchedChar), IsUtf16()));
377 }
378 if (matchedChar > UINT16_MAX) {
379 Char32OpCode charOp;
380 charOp.EmitOpCode(&buffer_, matchedChar);
381 } else {
382 CharOpCode charOp;
383 charOp.EmitOpCode(&buffer_, matchedChar);
384 }
385 if (isBackward) {
386 prevOp.EmitOpCode(&buffer_, 0);
387 }
388 }
389
ParseAlternativeAny(bool isBackward)390 void RegExpParser::ParseAlternativeAny(bool isBackward)
391 {
392 PrevOpCode prevOp;
393 if (isBackward) {
394 prevOp.EmitOpCode(&buffer_, 0);
395 }
396 if (IsDotAll()) {
397 AllOpCode allOp;
398 allOp.EmitOpCode(&buffer_, 0);
399 } else {
400 DotsOpCode dotsOp;
401 dotsOp.EmitOpCode(&buffer_, 0);
402 }
403 if (isBackward) {
404 prevOp.EmitOpCode(&buffer_, 0);
405 }
406 }
407
ParseAlternativeRange(bool isBackward)408 void RegExpParser::ParseAlternativeRange(bool isBackward)
409 {
410 PrevOpCode prevOp;
411 Advance();
412 if (isBackward) {
413 prevOp.EmitOpCode(&buffer_, 0);
414 }
415 bool isInvert = false;
416 if (c0_ == '^') {
417 isInvert = true;
418 Advance();
419 }
420 RangeSet rangeResult;
421 if (!ParseClassRanges(&rangeResult)) {
422 return;
423 }
424 if (isInvert) {
425 rangeResult.Invert(IsUtf16());
426 }
427 uint32_t highValue = rangeResult.HighestValue();
428 if (highValue <= UINT16_MAX) {
429 RangeOpCode rangeOp;
430 rangeOp.InsertOpCode(&buffer_, rangeResult);
431 } else {
432 Range32OpCode rangeOp;
433 rangeOp.InsertOpCode(&buffer_, rangeResult);
434 }
435
436 if (isBackward) {
437 prevOp.EmitOpCode(&buffer_, 0);
438 }
439 }
440
441 // CC-OFFNXT(G.FUN.01, huge_method) solid logic
ParseAlternativeImpl(bool isBackward,bool & isAtom,int & captureIndex)442 void RegExpParser::ParseAlternativeImpl(bool isBackward, bool &isAtom, int &captureIndex)
443 {
444 switch (c0_) {
445 case '^': {
446 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
447 PrintF("Assertion %c line start \n", c0_);
448 LineStartOpCode lineStartOp;
449 lineStartOp.EmitOpCode(&buffer_, 0);
450 Advance();
451 break;
452 }
453 case '$': {
454 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
455 PrintF("Assertion %c line end \n", c0_);
456 LineEndOpCode lineEndOp;
457 lineEndOp.EmitOpCode(&buffer_, 0);
458 Advance();
459 break;
460 }
461 case '\\': {
462 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
463 PrintF("Escape %c \n", c0_);
464 Advance();
465 ParseAlternativeEscape(isBackward, isAtom);
466 break;
467 }
468 case '(': {
469 Advance();
470 isAtom = ParseAssertionCapture(&captureIndex, isBackward);
471 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
472 Advance();
473 break;
474 }
475 case '.': {
476 ParseAlternativeAny(isBackward);
477 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
478 PrintF("Atom %c match any \n", c0_);
479 isAtom = true;
480 Advance();
481 break;
482 }
483 case '[': {
484 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
485 PrintF("Atom %c match range \n", c0_);
486 isAtom = true;
487 ParseAlternativeRange(isBackward);
488 break;
489 }
490 case '*':
491 case '+':
492 case '?':
493 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
494 ParseError("nothing to repeat");
495 return;
496 case '{': {
497 uint8_t *begin = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
498 int dummy;
499 if (ParserIntervalQuantifier(&dummy, &dummy)) {
500 ParseError("nothing to repeat");
501 return;
502 }
503 pc_ = begin;
504 Advance();
505 }
506 [[fallthrough]];
507 case '}':
508 case ']':
509 if (IsUtf16()) {
510 ParseError("syntax error");
511 return;
512 }
513 [[fallthrough]];
514 default: {
515 // PatternCharacter
516 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
517 PrintF("PatternCharacter %c\n", c0_);
518 isAtom = true;
519 ParsePatternCharacter(isBackward);
520 Advance();
521 break;
522 }
523 }
524 }
525
ParseAlternative(bool isBackward)526 void RegExpParser::ParseAlternative(bool isBackward)
527 {
528 size_t start = buffer_.size_;
529 while (c0_ != '|' && c0_ != KEY_EOF && c0_ != ')') {
530 if (isError_) {
531 return;
532 }
533 size_t atomBcStart = buffer_.GetSize();
534 int captureIndex = 0;
535 bool isAtom = false;
536 ParseAlternativeImpl(isBackward, isAtom, captureIndex);
537 if (isAtom && !isError_) {
538 ParseQuantifier(atomBcStart, captureIndex, captureCount_ - 1);
539 }
540 if (isBackward) {
541 size_t end = buffer_.GetSize();
542 size_t termSize = end - atomBcStart;
543 size_t moveSize = end - start;
544 buffer_.Expand(end + termSize);
545 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
546 if (memmove_s(buffer_.buf_ + start + termSize, moveSize, buffer_.buf_ + start, moveSize) != EOK) {
547 LOG(FATAL, COMMON) << "memmove_s failed";
548 UNREACHABLE();
549 }
550 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
551 if (memcpy_s(buffer_.buf_ + start, termSize, buffer_.buf_ + end, termSize) != EOK) {
552 LOG(FATAL, COMMON) << "memcpy_s failed";
553 UNREACHABLE();
554 }
555 }
556 }
557 }
558
FindGroupName(const PandaString & name)559 int RegExpParser::FindGroupName(const PandaString &name)
560 {
561 size_t len;
562 size_t nameLen = name.size();
563 const char *p = reinterpret_cast<char *>(groupNames_.buf_);
564 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
565 const char *bufEnd = reinterpret_cast<char *>(groupNames_.buf_) + groupNames_.size_;
566 int captureIndex = 1;
567 while (p < bufEnd) {
568 len = strlen(p);
569 if (len == nameLen && memcmp(name.c_str(), p, nameLen) == 0) {
570 return captureIndex;
571 }
572 p += len + 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
573 captureIndex++;
574 }
575 return -1;
576 }
577
578 template <typename OpCodeT>
InsertMatchAheadOpCode(bool isBackward)579 void RegExpParser::InsertMatchAheadOpCode(bool isBackward)
580 {
581 Advance();
582 uint32_t start = buffer_.size_;
583 ParseDisjunction(isBackward);
584 MatchOpCode matchOp;
585 matchOp.EmitOpCode(&buffer_, 0);
586 OpCodeT matchAheadOp;
587 uint32_t len = buffer_.size_ - start;
588 matchAheadOp.InsertOpCode(&buffer_, start, len);
589 }
590
HandleGroupName()591 bool RegExpParser::HandleGroupName()
592 {
593 PandaString name;
594 auto **pp = const_cast<const uint8_t **>(&pc_);
595 if (!ParseGroupSpecifier(pp, name)) {
596 ParseError("GroupName Syntax error.");
597 return false;
598 }
599 if (FindGroupName(name) > 0) {
600 ParseError("Duplicate GroupName error.");
601 return false;
602 }
603 groupNames_.EmitStr(name.c_str());
604 newGroupNames_.push_back(name);
605 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
606 PrintF("group name %s", name.c_str());
607
608 return true;
609 }
610
ParseAssertion(bool isBackward,bool & isAtom,bool & parseCapture)611 bool RegExpParser::ParseAssertion(bool isBackward, bool &isAtom, bool &parseCapture)
612 {
613 switch (c0_) {
614 // (?=Disjunction[?U, ?N])
615 case '=': {
616 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
617 PrintF("Assertion(?= Disjunction)\n");
618 InsertMatchAheadOpCode<MatchAheadOpCode>(isBackward);
619 break;
620 }
621 // (?!Disjunction[?U, ?N])
622 case '!': {
623 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
624 PrintF("Assertion(?! Disjunction)\n");
625 InsertMatchAheadOpCode<NegativeMatchAheadOpCode>(isBackward);
626 break;
627 }
628 case '<': {
629 Advance();
630 // (?<=Disjunction[?U, ?N])
631 if (c0_ == '=') {
632 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
633 PrintF("Assertion(?<= Disjunction)\n");
634 InsertMatchAheadOpCode<MatchAheadOpCode>(true);
635 return true;
636 // (?<!Disjunction[?U, ?N])
637 }
638 if (c0_ == '!') {
639 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
640 PrintF("Assertion(?<! Disjunction)\n");
641 InsertMatchAheadOpCode<NegativeMatchAheadOpCode>(true);
642 return true;
643 }
644
645 Prev();
646 if (!HandleGroupName()) {
647 return false;
648 }
649 Advance();
650 parseCapture = true;
651 break;
652 }
653 // (?:Disjunction[?U, ?N])
654 case ':':
655 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
656 PrintF("Atom(?<: Disjunction)\n");
657 isAtom = true;
658 Advance();
659 ParseDisjunction(isBackward);
660 break;
661 default:
662 Advance();
663 ParseError("? Syntax error.");
664 return false;
665 }
666
667 return true;
668 }
669
ParseAssertionCapture(int * captureIndex,bool isBackward)670 bool RegExpParser::ParseAssertionCapture(int *captureIndex, bool isBackward)
671 {
672 bool isAtom = false;
673 do {
674 bool parseCapture = false;
675 if (c0_ == '?') {
676 Advance();
677 if (!ParseAssertion(isBackward, isAtom, parseCapture)) {
678 return false;
679 }
680 } else {
681 groupNames_.EmitChar(0);
682 parseCapture = true;
683 }
684 if (parseCapture) {
685 isAtom = true;
686 *captureIndex = captureCount_++;
687 SaveEndOpCode saveEndOp;
688 SaveStartOpCode saveStartOp;
689 if (isBackward) {
690 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
691 } else {
692 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
693 }
694 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
695 PrintF("capture start %d \n", *captureIndex);
696 ParseDisjunction(isBackward);
697 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
698 PrintF("capture end %d \n", *captureIndex);
699 if (isBackward) {
700 saveStartOp.EmitOpCode(&buffer_, *captureIndex);
701 } else {
702 saveEndOp.EmitOpCode(&buffer_, *captureIndex);
703 }
704 }
705 } while (c0_ != ')' && c0_ != KEY_EOF);
706 if (c0_ != ')') {
707 ParseError("capture syntax error");
708 return false;
709 }
710 return isAtom;
711 }
712
ParseDecimalDigits()713 int RegExpParser::ParseDecimalDigits()
714 {
715 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
716 PrintF("Parse DecimalDigits------\n");
717 uint32_t result = 0;
718 bool overflow = false;
719 while (true) {
720 if (c0_ < '0' || c0_ > '9') {
721 break;
722 }
723 if (!overflow) {
724 if (UNLIKELY(result > (INT32_MAX - c0_ + '0') / DECIMAL_DIGITS_ADVANCE)) {
725 overflow = true;
726 } else {
727 result = result * DECIMAL_DIGITS_ADVANCE + c0_ - '0';
728 }
729 }
730 Advance();
731 }
732 if (overflow) {
733 return INT32_MAX;
734 }
735 return result;
736 }
737
ParserIntervalQuantifier(int * pmin,int * pmax)738 bool RegExpParser::ParserIntervalQuantifier(int *pmin, int *pmax)
739 {
740 // Quantifier::
741 // QuantifierPrefix
742 // QuantifierPrefix?
743 // QuantifierPrefix::
744 // *
745 // +
746 // ?
747 // {DecimalDigits}
748 // {DecimalDigits,}
749 // {DecimalDigits,DecimalDigits}
750 Advance();
751 *pmin = ParseDecimalDigits();
752 *pmax = *pmin;
753 switch (c0_) {
754 case ',': {
755 Advance();
756 if (c0_ == '}') {
757 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
758 PrintF("QuantifierPrefix{DecimalDigits,}\n");
759 *pmax = INT32_MAX;
760 Advance();
761 } else {
762 *pmax = ParseDecimalDigits();
763 if (c0_ == '}') {
764 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
765 PrintF("QuantifierPrefix{DecimalDigits,DecimalDigits}\n");
766 Advance();
767 } else {
768 return false;
769 }
770 }
771 break;
772 }
773 case '}':
774 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
775 PrintF("QuantifierPrefix{DecimalDigits}\n");
776 Advance();
777 break;
778 default:
779 Advance();
780 return false;
781 }
782 return true;
783 }
784
ParseQuantifierPrefix(int & min,int & max,bool & isGreedy)785 bool RegExpParser::ParseQuantifierPrefix(int &min, int &max, bool &isGreedy)
786 {
787 switch (c0_) {
788 case '*':
789 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
790 PrintF("QuantifierPrefix %c\n", c0_);
791 min = 0;
792 max = INT32_MAX;
793 Advance();
794 break;
795 case '+':
796 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
797 PrintF("QuantifierPrefix %c\n", c0_);
798 min = 1;
799 max = INT32_MAX;
800 Advance();
801 break;
802 case '?':
803 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
804 PrintF("QuantifierPrefix %c\n", c0_);
805 Advance();
806 min = 0;
807 max = 1;
808 break;
809 case '{': {
810 uint8_t *start = pc_ - 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
811 if (!ParserIntervalQuantifier(&min, &max)) {
812 pc_ = start;
813 Advance(); // back to '{'
814 return false;
815 }
816 if (min > max) {
817 ParseError("Invalid repetition count");
818 return false;
819 }
820 break;
821 }
822 default:
823 break;
824 }
825 if (c0_ == '?') {
826 isGreedy = false;
827 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
828 PrintF("Quantifier::QuantifierPrefix?\n");
829 Advance();
830 } else if (c0_ == '?' || c0_ == '+' || c0_ == '*' || c0_ == '{') {
831 ParseError("nothing to repeat");
832 return false;
833 }
834 return true;
835 }
836
ParseQuantifier(size_t atomBcStart,int captureStart,int captureEnd)837 void RegExpParser::ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd)
838 {
839 int min = -1;
840 int max = -1;
841 bool isGreedy = true;
842 if (!ParseQuantifierPrefix(min, max, isGreedy)) {
843 return;
844 }
845 if (min != -1 && max != -1) {
846 stackCount_++;
847 PushOpCode pushOp;
848 pushOp.InsertOpCode(&buffer_, atomBcStart);
849 atomBcStart += pushOp.GetSize();
850
851 if (captureStart != 0) {
852 SaveResetOpCode saveResetOp;
853 saveResetOp.InsertOpCode(&buffer_, atomBcStart, captureStart, captureEnd);
854 }
855
856 // zero advance check
857 if (max == INT32_MAX) {
858 stackCount_++;
859 PushCharOpCode pushCharOp;
860 pushCharOp.InsertOpCode(&buffer_, atomBcStart);
861 CheckCharOpCode checkCharOp;
862 // NOLINTNEXTLINE(readability-magic-numbers)
863 checkCharOp.EmitOpCode(&buffer_, RegExpOpCode::GetRegExpOpCode(RegExpOpCode::OP_LOOP)->GetSize());
864 }
865
866 if (isGreedy) {
867 LoopGreedyOpCode loopOp;
868 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
869 } else {
870 LoopOpCode loopOp;
871 loopOp.EmitOpCode(&buffer_, atomBcStart - buffer_.GetSize() - loopOp.GetSize(), min, max);
872 }
873
874 if (min == 0) {
875 if (isGreedy) {
876 SplitNextOpCode splitNextOp;
877 splitNextOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
878 } else {
879 SplitFirstOpCode splitFirstOp;
880 splitFirstOp.InsertOpCode(&buffer_, atomBcStart, buffer_.GetSize() - atomBcStart);
881 }
882 }
883
884 PopOpCode popOp;
885 popOp.EmitOpCode(&buffer_);
886 }
887 }
888
ParseGroupSpecifier(const uint8_t ** pp,PandaString & name)889 bool RegExpParser::ParseGroupSpecifier(const uint8_t **pp, PandaString &name)
890 {
891 const uint8_t *p = *pp;
892 uint32_t c;
893 std::array<char, CACHE_SIZE> buffer {};
894 char *q = buffer.data();
895 while (true) {
896 if (p <= end_) {
897 c = *p;
898 } else {
899 c = KEY_EOF;
900 }
901 if (c == '\\') {
902 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
903 p++;
904 if (*p != 'u') {
905 return false;
906 }
907 if (!ParseUnicodeEscape(&c)) {
908 return false;
909 }
910 } else if (c == '>') {
911 break;
912 } else if (c > CACHE_SIZE && c != KEY_EOF) {
913 c = static_cast<uint32_t>(UnicodeFromUtf8(p, UTF8_CHAR_LEN_MAX, &p));
914 } else if (c != KEY_EOF) {
915 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
916 p++;
917 } else {
918 return false;
919 }
920 if (q == buffer.data()) {
921 if (IsIdentFirst(c) != 0) {
922 return false;
923 }
924 } else {
925 if (!u_isIDPart(c)) {
926 return false;
927 }
928 }
929 if (q != nullptr) {
930 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
931 *q++ = c;
932 }
933 }
934 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
935 p++;
936 *pp = p;
937 name = buffer.data();
938 return true;
939 }
940
CalculateCaptureIndex(const uint8_t * p,int & captureIndex,const char * groupName,PandaString & name)941 bool RegExpParser::CalculateCaptureIndex(const uint8_t *p, int &captureIndex, const char *groupName, PandaString &name)
942 {
943 if (p[1] == '?') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
944 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
945 if (p[CAPTURE_CONUT_ADVANCE - 1] == '<' && p[CAPTURE_CONUT_ADVANCE] != '!' &&
946 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
947 p[CAPTURE_CONUT_ADVANCE] != '=') {
948 hasNamedCaptures_ = 1;
949 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
950 p += CAPTURE_CONUT_ADVANCE;
951 if (groupName != nullptr && ParseGroupSpecifier(&p, name) && strcmp(name.c_str(), groupName) == 0) {
952 return true;
953 }
954 captureIndex++;
955 }
956 } else {
957 captureIndex++;
958 }
959
960 return false;
961 }
962
ShiftPointerToClosingBracket(const uint8_t * p,const uint8_t * end)963 static inline void ShiftPointerToClosingBracket(const uint8_t *p, const uint8_t *end)
964 {
965 while (p < end && *p != ']') {
966 if (*p == '\\') {
967 p++; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
968 }
969 p++; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
970 }
971 }
972
ParseCaptureCountImpl(const uint8_t * p,int & captureIndex,const char * groupName,PandaString & name)973 bool RegExpParser::ParseCaptureCountImpl(const uint8_t *p, int &captureIndex, const char *groupName, PandaString &name)
974 {
975 switch (*p) {
976 case '(': {
977 if (CalculateCaptureIndex(p, captureIndex, groupName, name)) {
978 return true;
979 }
980 break;
981 }
982 case '\\':
983 p++; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
984 break;
985 case '[': {
986 ShiftPointerToClosingBracket(p, end_);
987 break;
988 }
989 default:
990 break;
991 }
992
993 return false;
994 }
995
ParseCaptureCount(const char * groupName)996 int RegExpParser::ParseCaptureCount(const char *groupName)
997 {
998 const uint8_t *p = nullptr;
999 int captureIndex = 1;
1000 PandaString name;
1001 hasNamedCaptures_ = 0;
1002 for (p = base_; p < end_; p++) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1003 if (ParseCaptureCountImpl(p, captureIndex, groupName, name)) {
1004 return captureIndex;
1005 }
1006 }
1007 return captureIndex;
1008 }
1009
ParseLookBehind(DynChunk & buffer,PrevOpCode & prevOp,bool isBackward)1010 void RegExpParser::ParseLookBehind(DynChunk &buffer, PrevOpCode &prevOp, bool isBackward)
1011 {
1012 if (isBackward) {
1013 prevOp.EmitOpCode(&buffer, 0);
1014 }
1015 Advance();
1016 }
1017
InsertRangeOpCode(DynChunk & buffer,RangeSet & rangeSet,PrevOpCode & prevOp,bool isBackward)1018 void RegExpParser::InsertRangeOpCode(DynChunk &buffer, RangeSet &rangeSet, PrevOpCode &prevOp, bool isBackward)
1019 {
1020 RangeOpCode rangeOp;
1021 if (isBackward) {
1022 prevOp.EmitOpCode(&buffer, 0);
1023 }
1024 rangeOp.InsertOpCode(&buffer, rangeSet);
1025 ParseLookBehind(buffer, prevOp, isBackward);
1026 }
1027
InsertRange32OpCode(DynChunk & buffer,RangeSet & rangeSet,PrevOpCode & prevOp,bool isBackward)1028 void RegExpParser::InsertRange32OpCode(DynChunk &buffer, RangeSet &rangeSet, PrevOpCode &prevOp, bool isBackward)
1029 {
1030 RangeSet atomRange(rangeSet);
1031 atomRange.Invert(IsUtf16());
1032 Range32OpCode rangeOp;
1033 if (isBackward) {
1034 prevOp.EmitOpCode(&buffer, 0);
1035 }
1036 rangeOp.InsertOpCode(&buffer, atomRange);
1037 ParseLookBehind(buffer, prevOp, isBackward);
1038 }
1039
ParseGroupName()1040 int RegExpParser::ParseGroupName()
1041 {
1042 Advance();
1043 if (c0_ != '<') {
1044 if (!IsUtf16() || HasNamedCaptures()) {
1045 ParseError("expecting group name.");
1046 return -1;
1047 }
1048 }
1049 Advance();
1050 Prev();
1051 PandaString name;
1052 auto **pp = const_cast<const uint8_t **>(&pc_);
1053 if (!ParseGroupSpecifier(pp, name)) {
1054 ParseError("GroupName Syntax error.");
1055 return -1;
1056 }
1057 int postion = FindGroupName(name);
1058 if (postion < 0) {
1059 postion = ParseCaptureCount(name.c_str());
1060 if (postion < 0 && (!IsUtf16() || HasNamedCaptures())) {
1061 ParseError("group name not defined");
1062 return -1;
1063 }
1064 }
1065
1066 return postion;
1067 }
1068
EmitRefOpCode(DynChunk & buffer,uint32_t para,bool isBackward)1069 static void EmitRefOpCode(DynChunk &buffer, uint32_t para, bool isBackward)
1070 {
1071 if (isBackward) {
1072 BackwardBackReferenceOpCode backReferenceOp;
1073 backReferenceOp.EmitOpCode(&buffer, para);
1074 } else {
1075 BackReferenceOpCode backReferenceOp;
1076 backReferenceOp.EmitOpCode(&buffer, para);
1077 }
1078 }
1079
1080 // CC-OFFNXT(G.FUN.01, huge_method) big switch case
1081 // NOLINTNEXTLINE(readability-function-size)
ParseAtomEscape(bool isBackward)1082 int RegExpParser::ParseAtomEscape(bool isBackward)
1083 {
1084 // AtomEscape[U, N]::
1085 // DecimalEscape
1086 // CharacterClassEscape[?U]
1087 // CharacterEscape[?U]
1088 // [+N]kGroupName[?U]
1089 int result = -1;
1090 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1091 PrintF("Parse AtomEscape------\n");
1092 PrevOpCode prevOp;
1093 switch (c0_) {
1094 case KEY_EOF:
1095 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1096 ParseError("unexpected end");
1097 break;
1098 // DecimalEscape
1099 case '1':
1100 case '2':
1101 case '3':
1102 case '4':
1103 case '5':
1104 case '6':
1105 case '7':
1106 case '8':
1107 case '9': {
1108 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1109 PrintF("NonZeroDigit %c\n", c0_);
1110 int capture = ParseDecimalDigits();
1111 if (capture > captureCount_ - 1 && capture > ParseCaptureCount(nullptr) - 1) {
1112 ParseError("invalid backreference count");
1113 break;
1114 }
1115 EmitRefOpCode(buffer_, capture, isBackward);
1116 break;
1117 }
1118 // CharacterClassEscape
1119 case 'd': {
1120 // [0-9]
1121 InsertRangeOpCode(buffer_, g_gRangeD, prevOp, isBackward);
1122 break;
1123 }
1124 case 'D': {
1125 // [^0-9]
1126 InsertRange32OpCode(buffer_, g_gRangeD, prevOp, isBackward);
1127 break;
1128 }
1129 case 's': {
1130 // [\f\n\r\t\v]
1131 InsertRangeOpCode(buffer_, g_gRangeS, prevOp, isBackward);
1132 break;
1133 }
1134 case 'S': {
1135 InsertRange32OpCode(buffer_, g_gRangeS, prevOp, isBackward);
1136 break;
1137 }
1138 case 'w': {
1139 // [A-Za-z0-9]
1140 InsertRangeOpCode(buffer_, g_gRangeW, prevOp, isBackward);
1141 break;
1142 }
1143 case 'W': {
1144 // [^A-Za-z0-9]
1145 InsertRange32OpCode(buffer_, g_gRangeW, prevOp, isBackward);
1146 break;
1147 }
1148 // P{UnicodePropertyValueExpression}
1149 // p{UnicodePropertyValueExpression}
1150 case 'P':
1151 case 'p':
1152 // [+N]kGroupName[?U]
1153 case 'k': {
1154 int postion = ParseGroupName();
1155 if (postion < 0) {
1156 break;
1157 }
1158 EmitRefOpCode(buffer_, postion, isBackward);
1159 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1160 Advance();
1161 break;
1162 }
1163 default:
1164 result = ParseCharacterEscape();
1165 break;
1166 }
1167 return result;
1168 }
1169
RecountCaptures()1170 int RegExpParser::RecountCaptures()
1171 {
1172 if (totalCaptureCount_ < 0) {
1173 const char *name = reinterpret_cast<const char *>(groupNames_.buf_);
1174 totalCaptureCount_ = ParseCaptureCount(name);
1175 }
1176 return totalCaptureCount_;
1177 }
HasNamedCaptures()1178 bool RegExpParser::HasNamedCaptures()
1179 {
1180 if (hasNamedCaptures_ < 0) {
1181 RecountCaptures();
1182 }
1183 return false;
1184 }
1185
1186 // CC-OFFNXT(G.FUN.01, huge_cyclomatic_complexity, huge_method) big switch case
ParseCharacterEscape()1187 int RegExpParser::ParseCharacterEscape()
1188 {
1189 // CharacterEscape[U]::
1190 // ControlEscape
1191 // c ControlLetter
1192 // 0 [lookahead ∉ DecimalDigit]
1193 // HexEscapeSequence
1194 // RegExpUnicodeEscapeSequence[?U]
1195 // IdentityEscape[?U]
1196 uint32_t result = 0;
1197 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1198 switch (c0_) {
1199 // ControlEscape
1200 case 'f':
1201 result = '\f';
1202 PrintControlEscapeAndAdvance();
1203 break;
1204 case 'n':
1205 result = '\n';
1206 PrintControlEscapeAndAdvance();
1207 break;
1208 case 'r':
1209 result = '\r';
1210 PrintControlEscapeAndAdvance();
1211 break;
1212 case 't':
1213 result = '\t';
1214 PrintControlEscapeAndAdvance();
1215 break;
1216 case 'v':
1217 result = '\v';
1218 PrintControlEscapeAndAdvance();
1219 break;
1220 // c ControlLetter
1221 case 'c': {
1222 ParseControlLetter(result);
1223 break;
1224 }
1225 case '0': {
1226 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1227 PrintF("CharacterEscape 0 [lookahead ∉ DecimalDigit]\n");
1228 if (IsUtf16() && !(*pc_ >= '0' && *pc_ <= '9')) { // NOLINT(readability-magic-numbers)
1229 Advance();
1230 result = 0;
1231 break;
1232 }
1233 [[fallthrough]];
1234 }
1235 case '1':
1236 case '2':
1237 case '3':
1238 case '4':
1239 case '5':
1240 case '6':
1241 case '7': {
1242 if (IsUtf16()) {
1243 // With /u, decimal escape is not interpreted as octal character code.
1244 ParseError("Invalid class escape");
1245 return 0;
1246 }
1247 result = ParseOctalLiteral();
1248 break;
1249 }
1250 // ParseHexEscapeSequence
1251 // ParseRegExpUnicodeEscapeSequence
1252 case 'x': {
1253 Advance();
1254 if (ParseHexEscape(UNICODE_HEX_ADVANCE, &result)) {
1255 return result;
1256 }
1257 if (IsUtf16()) {
1258 ParseError("Invalid class escape");
1259 return -1;
1260 }
1261 result = 'x';
1262 break;
1263 }
1264 case 'u': {
1265 Advance();
1266 if (ParseUnicodeEscape(&result)) {
1267 return result;
1268 }
1269 if (IsUtf16()) {
1270 // With /u, invalid escapes are not treated as identity escapes.
1271 ParseError("Invalid unicode escape");
1272 return 0;
1273 }
1274 // If \u is not followed by a two-digit hexadecimal, treat it
1275 // as an identity escape.
1276 result = 'u';
1277 break;
1278 }
1279 // IdentityEscape[?U]
1280 case '$':
1281 case '(':
1282 case ')':
1283 case '*':
1284 case '+':
1285 case '.':
1286 case '/':
1287 case '?':
1288 case '[':
1289 case '\\':
1290 case ']':
1291 case '^':
1292 case '{':
1293 case '|':
1294 case '}':
1295 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1296 PrintF("IdentityEscape %c\n", c0_);
1297 result = c0_;
1298 Advance();
1299 break;
1300 default: {
1301 ParseCharacterEscapeDefault(result);
1302 break;
1303 }
1304 }
1305 return result;
1306 }
1307
ParseCharacterEscapeDefault(uint32_t & result)1308 void RegExpParser::ParseCharacterEscapeDefault(uint32_t &result)
1309 {
1310 if (IsUtf16()) {
1311 ParseError("Invalid unicode escape");
1312 result = 0;
1313 return;
1314 }
1315 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1316 PrintF("SourceCharacter %c\n", c0_);
1317 result = c0_;
1318 if (result < CHAR_MAXS) {
1319 Advance();
1320 }
1321 }
1322
ParseControlLetter(uint32_t & result)1323 void RegExpParser::ParseControlLetter(uint32_t &result)
1324 {
1325 Advance();
1326 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1327 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1328 PrintF("ControlLetter %c\n", c0_);
1329 result = static_cast<uint32_t>(c0_) & 0x1f; // NOLINT(readability-magic-numbers, hicpp-signed-bitwise)
1330 Advance();
1331 } else {
1332 if (!IsUtf16()) {
1333 pc_--; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1334 result = '\\';
1335 } else {
1336 ParseError("Invalid control letter");
1337 result = -1;
1338 }
1339 }
1340 }
1341
PrintControlEscapeAndAdvance()1342 void RegExpParser::PrintControlEscapeAndAdvance()
1343 {
1344 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1345 PrintF("ControlEscape %c\n", c0_);
1346 Advance();
1347 }
1348
ParseClassRangesImpl(RangeSet * result)1349 bool RegExpParser::ParseClassRangesImpl(RangeSet *result)
1350 {
1351 RangeSet s1;
1352 uint32_t c1 = ParseClassAtom(&s1);
1353 if (c1 == UINT32_MAX) {
1354 ParseError("invalid class range");
1355 return false;
1356 }
1357
1358 int nextC0 = *pc_;
1359 if (c0_ == '-' && nextC0 != ']') {
1360 if (c1 == CLASS_RANGE_BASE) {
1361 if (IsUtf16()) {
1362 ParseError("invalid class range");
1363 return false;
1364 }
1365 result->Insert(s1);
1366 return true;
1367 }
1368 Advance();
1369 RangeSet s2;
1370 uint32_t c2 = ParseClassAtom(&s2);
1371 if (c2 == UINT32_MAX) {
1372 ParseError("invalid class range");
1373 return false;
1374 }
1375 if (c2 == CLASS_RANGE_BASE) {
1376 if (IsUtf16()) {
1377 ParseError("invalid class range");
1378 return false;
1379 }
1380 result->Insert(s2);
1381 return true;
1382 }
1383 if (c1 < INT8_MAX) {
1384 if (c1 > c2) {
1385 ParseError("invalid class range");
1386 return false;
1387 }
1388 }
1389 if (IsIgnoreCase()) {
1390 c1 = static_cast<uint32_t>(Canonicalize(c1, IsUtf16()));
1391 c2 = static_cast<uint32_t>(Canonicalize(c2, IsUtf16()));
1392 }
1393
1394 result->Insert(c1, c2);
1395 } else {
1396 result->Insert(s1);
1397 }
1398
1399 return true;
1400 }
1401
ParseClassRanges(RangeSet * result)1402 bool RegExpParser::ParseClassRanges(RangeSet *result)
1403 {
1404 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1405 PrintF("Parse ClassRanges------\n");
1406 while (c0_ != ']') {
1407 if (!ParseClassRangesImpl(result)) {
1408 return false;
1409 }
1410 }
1411 Advance();
1412 return true;
1413 }
1414
ParseClassAtom(RangeSet * atom)1415 uint32_t RegExpParser::ParseClassAtom(RangeSet *atom)
1416 {
1417 uint32_t ret = UINT32_MAX;
1418 switch (c0_) {
1419 case '\\': {
1420 Advance();
1421 ret = static_cast<uint32_t>(ParseClassEscape(atom));
1422 break;
1423 }
1424 case KEY_EOF:
1425 break;
1426 case 0: {
1427 if (pc_ >= end_) {
1428 return UINT32_MAX;
1429 }
1430 [[fallthrough]];
1431 }
1432 default: {
1433 uint32_t value = c0_;
1434 size_t u16Size;
1435 if (c0_ > INT8_MAX) {
1436 pc_ -= 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
1437 auto u16Result = utf::ConvertUtf8ToUtf16Pair(pc_, true);
1438 value = u16Result.first;
1439 u16Size = u16Result.second;
1440 Advance(u16Size + 1);
1441 } else {
1442 Advance();
1443 }
1444 if (IsIgnoreCase()) {
1445 value = static_cast<uint32_t>(Canonicalize(value, IsUtf16()));
1446 }
1447 atom->Insert(RangeSet(value));
1448 ret = value;
1449 break;
1450 }
1451 }
1452 return ret;
1453 }
1454
InsertRangeBase(RangeSet * atom,RangeSet & rangeSet,bool invert)1455 void RegExpParser::InsertRangeBase(RangeSet *atom, RangeSet &rangeSet, bool invert)
1456 {
1457 atom->Insert(rangeSet);
1458 if (invert) {
1459 atom->Invert(IsUtf16());
1460 }
1461 }
1462
ParseClassEscape(RangeSet * atom)1463 int RegExpParser::ParseClassEscape(RangeSet *atom)
1464 {
1465 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1466 PrintF("Parse ClassEscape------\n");
1467 int result = -1;
1468 switch (c0_) {
1469 case 'b':
1470 Advance();
1471 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1472 PrintF("ClassEscape %c", 'b');
1473 result = '\b';
1474 atom->Insert(RangeSet(static_cast<uint32_t>('\b')));
1475 break;
1476 case '-':
1477 Advance();
1478 result = '-';
1479 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1480 PrintF("ClassEscape %c", '-');
1481 atom->Insert(RangeSet(static_cast<uint32_t>('-')));
1482 break;
1483 // CharacterClassEscape
1484 case 'd':
1485 case 'D':
1486 result = CLASS_RANGE_BASE;
1487 InsertRangeBase(atom, g_gRangeD, c0_ == 'D');
1488 Advance();
1489 break;
1490 case 's':
1491 case 'S':
1492 result = CLASS_RANGE_BASE;
1493 InsertRangeBase(atom, g_gRangeS, c0_ == 'S');
1494 Advance();
1495 break;
1496 case 'w':
1497 case 'W':
1498 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1499 PrintF("ClassEscape::CharacterClassEscape %c\n", c0_);
1500 result = CLASS_RANGE_BASE;
1501 InsertRangeBase(atom, g_gRangeW, c0_ == 'W');
1502 Advance();
1503 break;
1504 // P{UnicodePropertyValueExpression}
1505 // p{UnicodePropertyValueExpression}
1506 case 'P':
1507 case 'p':
1508 ParseUnicodePropertyValueCharacters(result);
1509 break;
1510 default:
1511 result = ParseCharacterEscape();
1512 int value = result;
1513 if (IsIgnoreCase()) {
1514 value = Canonicalize(value, IsUtf16());
1515 }
1516 atom->Insert(RangeSet(static_cast<uint32_t>(value)));
1517 break;
1518 }
1519 return result;
1520 }
1521
ParseUnicodePropertyValueCharacters(int & result)1522 void RegExpParser::ParseUnicodePropertyValueCharacters(int &result)
1523 {
1524 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1525 PrintF("Warning: \\p is not supported in ECMA 2015!");
1526 Advance();
1527 if (c0_ == '{') {
1528 Advance();
1529 if (c0_ == '}') {
1530 return; // p{}, invalid
1531 }
1532 bool isValue = false;
1533 ParseUnicodePropertyValueCharactersImpl(&isValue);
1534 if (!isValue && c0_ == '=') {
1535 // UnicodePropertyName = UnicodePropertyValue
1536 Advance();
1537 if (c0_ == '}') {
1538 return; // p{xxx=}, invalid
1539 }
1540 ParseUnicodePropertyValueCharactersImpl(&isValue);
1541 }
1542 if (c0_ != '}') {
1543 return; // p{xxx, invalid
1544 }
1545 // should do atom->Invert() here after ECMA 9.0
1546 Advance();
1547 result = CLASS_RANGE_BASE;
1548 }
1549 }
1550
ParseUnicodePropertyValueCharactersImpl(bool * isValue)1551 void RegExpParser::ParseUnicodePropertyValueCharactersImpl(bool *isValue)
1552 {
1553 if ((c0_ >= 'A' && c0_ <= 'Z') || (c0_ >= 'a' && c0_ <= 'z')) {
1554 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1555 PrintF("UnicodePropertyCharacter::ControlLetter %c\n", c0_);
1556 } else if (c0_ == '_') {
1557 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1558 PrintF("UnicodePropertyCharacter:: _ \n");
1559 } else if (c0_ >= '0' && c0_ <= '9') {
1560 *isValue = true;
1561 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1562 PrintF("UnicodePropertyValueCharacter::DecimalDigit %c\n", c0_);
1563 } else {
1564 return;
1565 }
1566 Advance();
1567 ParseUnicodePropertyValueCharactersImpl(isValue);
1568 }
1569
1570 // NOLINTNEXTLINE(cert-dcl50-cpp)
PrintF(const char * fmt,...)1571 void RegExpParser::PrintF(const char *fmt, ...)
1572 {
1573 (void)fmt;
1574 }
1575
ParseError(const char * errorMessage)1576 void RegExpParser::ParseError(const char *errorMessage)
1577 {
1578 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1579 PrintF("error: ");
1580 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1581 PrintF(errorMessage);
1582 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
1583 PrintF("\n");
1584 SetIsError();
1585 size_t length = strlen(errorMessage) + 1;
1586 if (memcpy_s(errorMsg_, length, errorMessage, length) != EOK) {
1587 LOG(FATAL, COMMON) << "memcpy_s failed";
1588 UNREACHABLE();
1589 }
1590 }
1591
IsIdentFirst(uint32_t c)1592 int RegExpParser::IsIdentFirst(uint32_t c)
1593 {
1594 if (c < CACHE_SIZE) {
1595 // NOLINTNEXTLINE(hicpp-signed-bitwise
1596 return (ID_START_TABLE_ASCII[c >> 5] >> (c & 31)) & 1; // 5: Shift five bits 31: and operation binary of 31
1597 }
1598 return static_cast<int>(u_isIDStart(c));
1599 }
1600 } // namespace ark