1 /** 2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #ifndef PANDA_RUNTIME_REGEXP_EXECUTOR_H 17 #define PANDA_RUNTIME_REGEXP_EXECUTOR_H 18 19 #include "runtime/regexp/ecmascript/regexp_parser.h" 20 21 namespace ark { 22 23 template <class T> 24 struct RegExpMatchResult { 25 uint32_t endIndex = 0; 26 uint32_t index = 0; 27 // NOTE(kirill-mitkin): Change ecmascript API to PandaVector<T> 28 // first value is true if result is undefined 29 PandaVector<std::pair<bool, T>> captures; 30 PandaVector<std::pair<uint32_t, uint32_t>> indices; 31 bool isSuccess = false; 32 bool isWide = false; 33 }; 34 35 class RegExpExecutor { 36 public: 37 struct CaptureState { 38 const uint8_t *captureStart; 39 const uint8_t *captureEnd; 40 }; 41 42 enum StateType : uint8_t { 43 STATE_SPLIT = 0, 44 STATE_MATCH_AHEAD, 45 STATE_NEGATIVE_MATCH_AHEAD, 46 }; 47 48 struct RegExpState { 49 StateType type = STATE_SPLIT; 50 uint32_t currentPc = 0; 51 uint32_t currentStack = 0; 52 const uint8_t *currentPtr = nullptr; 53 __extension__ CaptureState *captureResultList[0]; // NOLINT(modernize-avoid-c-arrays) 54 }; 55 56 explicit RegExpExecutor() = default; 57 ~RegExpExecutor()58 ~RegExpExecutor() 59 { 60 auto allocator = Runtime::GetCurrent()->GetInternalAllocator(); 61 allocator->DeleteArray(stack_); 62 allocator->DeleteArray(captureResultList_); 63 allocator->DeleteArray(stateStack_); 64 } 65 66 NO_COPY_SEMANTIC(RegExpExecutor); 67 NO_MOVE_SEMANTIC(RegExpExecutor); 68 69 PANDA_PUBLIC_API bool Execute(const uint8_t *input, uint32_t lastIndex, uint32_t length, uint8_t *buf, 70 bool isWideChar = false); 71 72 bool ExecuteInternal(const DynChunk &byteCode, uint32_t pcEnd); 73 // CC-OFFNXT(G.FUD.06) perf critical HandleFirstSplit()74 inline bool HandleFirstSplit() 75 { 76 if (GetCurrentPC() == RegExpParser::OP_START_OFFSET && stateStackLen_ == 0 && 77 (flags_ & RegExpParser::FLAG_STICKY) == 0) { 78 if (IsEOF()) { 79 if (MatchFailed()) { 80 return false; 81 } 82 } else { 83 AdvanceCurrentPtr(); 84 PushRegExpState(STATE_SPLIT, RegExpParser::OP_START_OFFSET); 85 } 86 } 87 return true; 88 } 89 HandleOpAll(uint8_t opCode)90 inline bool HandleOpAll(uint8_t opCode) 91 { 92 if (IsEOF()) { 93 return !MatchFailed(); 94 } 95 uint32_t currentChar = GetCurrentChar(); 96 if ((opCode == RegExpOpCode::OP_DOTS) && IsTerminator(currentChar)) { 97 return !MatchFailed(); 98 } 99 Advance(opCode); 100 return true; 101 } 102 103 // CC-OFFNXT(G.FUD.06) perf critical HandleOpChar(const DynChunk & byteCode,uint8_t opCode)104 inline bool HandleOpChar(const DynChunk &byteCode, uint8_t opCode) 105 { 106 uint32_t expectedChar; 107 if (opCode == RegExpOpCode::OP_CHAR32) { 108 expectedChar = byteCode.GetU32(GetCurrentPC() + 1); 109 } else { 110 expectedChar = byteCode.GetU16(GetCurrentPC() + 1); 111 } 112 if (IsEOF()) { 113 return !MatchFailed(); 114 } 115 uint32_t currentChar = GetCurrentChar(); 116 if (IsIgnoreCase()) { 117 currentChar = static_cast<uint32_t>(RegExpParser::Canonicalize(currentChar, IsUtf16())); 118 } 119 if (currentChar == expectedChar) { 120 Advance(opCode); 121 } else { 122 if (MatchFailed()) { 123 return false; 124 } 125 } 126 return true; 127 } 128 129 // CC-OFFNXT(G.FUD.06) perf critical HandleOpWordBoundary(uint8_t opCode)130 inline bool HandleOpWordBoundary(uint8_t opCode) 131 { 132 if (IsEOF()) { 133 if (opCode == RegExpOpCode::OP_WORD_BOUNDARY) { 134 Advance(opCode); 135 } else { 136 if (MatchFailed()) { 137 return false; 138 } 139 } 140 return true; 141 } 142 bool preIsWord = false; 143 if (GetCurrentPtr() != input_) { 144 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 145 preIsWord = IsWordChar(PeekPrevChar(currentPtr_, input_)); 146 } 147 bool currentIsWord = IsWordChar(PeekChar(currentPtr_, inputEnd_)); 148 if (((opCode == RegExpOpCode::OP_WORD_BOUNDARY) && 149 ((!preIsWord && currentIsWord) || (preIsWord && !currentIsWord))) || 150 ((opCode == RegExpOpCode::OP_NOT_WORD_BOUNDARY) && 151 ((preIsWord && currentIsWord) || (!preIsWord && !currentIsWord)))) { 152 Advance(opCode); 153 } else { 154 if (MatchFailed()) { 155 return false; 156 } 157 } 158 return true; 159 } 160 HandleOpLineStart(uint8_t opCode)161 inline bool HandleOpLineStart(uint8_t opCode) 162 { 163 if (IsEOF()) { 164 return !MatchFailed(); 165 } 166 if ((GetCurrentPtr() == input_) || 167 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 168 ((flags_ & RegExpParser::FLAG_MULTILINE) != 0 && PeekPrevChar(currentPtr_, input_) == '\n')) { 169 Advance(opCode); 170 } else if (MatchFailed()) { 171 return false; 172 } 173 return true; 174 } 175 HandleOpLineEnd(uint8_t opCode)176 inline bool HandleOpLineEnd(uint8_t opCode) 177 { 178 if (IsEOF() || 179 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 180 ((flags_ & RegExpParser::FLAG_MULTILINE) != 0 && PeekChar(currentPtr_, inputEnd_) == '\n')) { 181 Advance(opCode); 182 } else { 183 if (MatchFailed()) { 184 return false; 185 } 186 } 187 return true; 188 } 189 HandleOpSaveStart(const DynChunk & byteCode,uint8_t opCode)190 inline void HandleOpSaveStart(const DynChunk &byteCode, uint8_t opCode) 191 { 192 uint32_t captureIndex = byteCode.GetU8(GetCurrentPC() + 1); 193 ASSERT(captureIndex < nCapture_); 194 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 195 CaptureState *captureState = &captureResultList_[captureIndex]; 196 captureState->captureStart = GetCurrentPtr(); 197 Advance(opCode); 198 } 199 HandleOpSaveEnd(const DynChunk & byteCode,uint8_t opCode)200 inline void HandleOpSaveEnd(const DynChunk &byteCode, uint8_t opCode) 201 { 202 uint32_t captureIndex = byteCode.GetU8(GetCurrentPC() + 1); 203 ASSERT(captureIndex < nCapture_); 204 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 205 CaptureState *captureState = &captureResultList_[captureIndex]; 206 captureState->captureEnd = GetCurrentPtr(); 207 Advance(opCode); 208 } 209 HandleOpSaveReset(const DynChunk & byteCode,uint8_t opCode)210 inline void HandleOpSaveReset(const DynChunk &byteCode, uint8_t opCode) 211 { 212 uint32_t catpureStartIndex = byteCode.GetU8(GetCurrentPC() + SAVE_RESET_START); 213 uint32_t catpureEndIndex = byteCode.GetU8(GetCurrentPC() + SAVE_RESET_END); 214 for (uint32_t i = catpureStartIndex; i <= catpureEndIndex; i++) { 215 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 216 CaptureState *captureState = &captureResultList_[i]; 217 captureState->captureStart = nullptr; 218 captureState->captureEnd = nullptr; 219 } 220 Advance(opCode); 221 } 222 HandleOpMatch(const DynChunk & byteCode,uint8_t opCode)223 inline void HandleOpMatch(const DynChunk &byteCode, uint8_t opCode) 224 { 225 auto type = static_cast<StateType>(opCode - RegExpOpCode::OP_SPLIT_NEXT); 226 ASSERT(type == STATE_SPLIT || type == STATE_MATCH_AHEAD || type == STATE_NEGATIVE_MATCH_AHEAD); 227 uint32_t offset = byteCode.GetU32(GetCurrentPC() + 1); 228 Advance(opCode); 229 uint32_t splitPc = GetCurrentPC() + offset; 230 PushRegExpState(type, splitPc); 231 } 232 HandleOpSplitFirst(const DynChunk & byteCode,uint8_t opCode)233 inline void HandleOpSplitFirst(const DynChunk &byteCode, uint8_t opCode) 234 { 235 uint32_t offset = byteCode.GetU32(GetCurrentPC() + 1); 236 Advance(opCode); 237 PushRegExpState(STATE_SPLIT, GetCurrentPC()); 238 AdvanceOffset(offset); 239 } 240 HandleOpPrev(uint8_t opCode)241 inline bool HandleOpPrev(uint8_t opCode) 242 { 243 if (GetCurrentPtr() == input_) { 244 if (MatchFailed()) { 245 return false; 246 } 247 } else { 248 PrevPtr(¤tPtr_, input_); 249 Advance(opCode); 250 } 251 return true; 252 } 253 254 // CC-OFFNXT(G.FUD.06) perf critical HandleOpLoop(const DynChunk & byteCode,uint8_t opCode)255 inline void HandleOpLoop(const DynChunk &byteCode, uint8_t opCode) 256 { 257 uint32_t quantifyMin = byteCode.GetU32(GetCurrentPC() + LOOP_MIN_OFFSET); 258 uint32_t quantifyMax = byteCode.GetU32(GetCurrentPC() + LOOP_MAX_OFFSET); 259 uint32_t pcOffset = byteCode.GetU32(GetCurrentPC() + LOOP_PC_OFFSET); 260 Advance(opCode); 261 uint32_t loopPcEnd = GetCurrentPC(); 262 uint32_t loopPcStart = GetCurrentPC() + pcOffset; 263 bool isGreedy = opCode == RegExpOpCode::OP_LOOP_GREEDY; 264 uint32_t loopMax = isGreedy ? quantifyMax : quantifyMin; 265 266 uint32_t loopCount = PeekStack(); 267 SetStackValue(++loopCount); 268 if (loopCount < loopMax) { 269 // greedy failed, goto next 270 if (loopCount >= quantifyMin) { 271 PushRegExpState(STATE_SPLIT, loopPcEnd); 272 } 273 // Goto loop start 274 SetCurrentPC(loopPcStart); 275 } else { 276 if (!isGreedy && (loopCount < quantifyMax)) { 277 PushRegExpState(STATE_SPLIT, loopPcStart); 278 } 279 } 280 } 281 282 // CC-OFFNXT(G.FUD.06) perf critical HandleOpRange32(const DynChunk & byteCode)283 inline bool HandleOpRange32(const DynChunk &byteCode) 284 { 285 if (IsEOF()) { 286 return !MatchFailed(); 287 } 288 uint32_t currentChar = GetCurrentChar(); 289 if (IsIgnoreCase()) { 290 currentChar = static_cast<uint32_t>(RegExpParser::Canonicalize(currentChar, IsUtf16())); 291 } 292 uint16_t rangeCount = byteCode.GetU16(GetCurrentPC() + 1); 293 bool isFound = false; 294 int32_t idxMin = 0; 295 int32_t idxMax = static_cast<int32_t>(rangeCount) - 1; 296 int32_t idx = 0; 297 uint32_t low = 0; 298 uint32_t high = byteCode.GetU32(GetCurrentPC() + RANGE32_HEAD_OFFSET + idxMax * RANGE32_MAX_OFFSET + 299 RANGE32_MAX_HALF_OFFSET); 300 if (currentChar <= high) { 301 while (idxMin <= idxMax) { 302 idx = (idxMin + idxMax) / RANGE32_OFFSET; 303 low = byteCode.GetU32(GetCurrentPC() + RANGE32_HEAD_OFFSET + 304 static_cast<uint32_t>(idx) * RANGE32_MAX_OFFSET); 305 high = byteCode.GetU32(GetCurrentPC() + RANGE32_HEAD_OFFSET + 306 static_cast<uint32_t>(idx) * RANGE32_MAX_OFFSET + RANGE32_MAX_HALF_OFFSET); 307 if (currentChar < low) { 308 idxMax = idx - 1; 309 } else if (currentChar > high) { 310 idxMin = idx + 1; 311 } else { 312 isFound = true; 313 break; 314 } 315 } 316 } 317 if (isFound) { 318 AdvanceOffset(rangeCount * RANGE32_MAX_OFFSET + RANGE32_HEAD_OFFSET); 319 } else { 320 if (MatchFailed()) { 321 return false; 322 } 323 } 324 return true; 325 } 326 327 // CC-OFFNXT(G.FUD.06) perf critical HandleOpRange(const DynChunk & byteCode)328 inline bool HandleOpRange(const DynChunk &byteCode) 329 { 330 if (IsEOF()) { 331 return !MatchFailed(); 332 } 333 uint32_t currentChar = GetCurrentChar(); 334 if (IsIgnoreCase()) { 335 currentChar = static_cast<uint32_t>(RegExpParser::Canonicalize(currentChar, IsUtf16())); 336 } 337 uint16_t rangeCount = byteCode.GetU16(GetCurrentPC() + 1); 338 bool isFound = false; 339 int32_t idxMin = 0; 340 int32_t idxMax = rangeCount - 1; 341 int32_t idx = 0; 342 uint32_t low = 0; 343 uint32_t high = 344 byteCode.GetU16(GetCurrentPC() + RANGE32_HEAD_OFFSET + idxMax * RANGE32_MAX_HALF_OFFSET + RANGE32_OFFSET); 345 if (currentChar <= high) { 346 while (idxMin <= idxMax) { 347 idx = (idxMin + idxMax) / RANGE32_OFFSET; 348 low = byteCode.GetU16(GetCurrentPC() + RANGE32_HEAD_OFFSET + 349 static_cast<uint32_t>(idx) * RANGE32_MAX_HALF_OFFSET); 350 high = byteCode.GetU16(GetCurrentPC() + RANGE32_HEAD_OFFSET + 351 static_cast<uint32_t>(idx) * RANGE32_MAX_HALF_OFFSET + RANGE32_OFFSET); 352 if (currentChar < low) { 353 idxMax = idx - 1; 354 } else if (currentChar > high) { 355 idxMin = idx + 1; 356 } else { 357 isFound = true; 358 break; 359 } 360 } 361 } 362 if (isFound) { 363 AdvanceOffset(rangeCount * RANGE32_MAX_HALF_OFFSET + RANGE32_HEAD_OFFSET); 364 } else { 365 if (MatchFailed()) { 366 return false; 367 } 368 } 369 return true; 370 } 371 372 bool HandleOpBackReference(const DynChunk &byteCode, uint8_t opCode); 373 374 bool HandleOpBackReferenceMatch(const uint8_t *captureStart, const uint8_t *captureEnd, uint8_t opCode); 375 376 bool HandleOpBackwardBackReferenceMatch(const uint8_t *captureStart, const uint8_t *captureEnd, uint8_t opCode); 377 378 inline void Advance(uint8_t opCode, uint32_t offset = 0) 379 { 380 currentPc_ += offset + static_cast<uint32_t>(RegExpOpCode::GetRegExpOpCode(opCode)->GetSize()); 381 } 382 AdvanceOffset(uint32_t offset)383 inline void AdvanceOffset(uint32_t offset) 384 { 385 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 386 currentPc_ += offset; 387 } 388 GetCurrentChar()389 inline uint32_t GetCurrentChar() 390 { 391 return GetChar(¤tPtr_, inputEnd_); 392 } 393 AdvanceCurrentPtr()394 inline void AdvanceCurrentPtr() 395 { 396 AdvancePtr(¤tPtr_, inputEnd_); 397 } 398 399 uint32_t GetChar(const uint8_t **pp, const uint8_t *end) const; 400 401 uint32_t PeekChar(const uint8_t *p, const uint8_t *end) const; 402 403 void AdvancePtr(const uint8_t **pp, const uint8_t *end) const; 404 405 uint32_t PeekPrevChar(const uint8_t *p, const uint8_t *start) const; 406 407 uint32_t GetPrevChar(const uint8_t **pp, const uint8_t *start) const; 408 409 void PrevPtr(const uint8_t **pp, const uint8_t *start) const; 410 411 bool MatchFailed(bool isMatched = false); 412 SetCurrentPC(uint32_t pc)413 void SetCurrentPC(uint32_t pc) 414 { 415 currentPc_ = pc; 416 } 417 SetCurrentPtr(const uint8_t * ptr)418 void SetCurrentPtr(const uint8_t *ptr) 419 { 420 currentPtr_ = ptr; 421 } 422 IsEOF()423 bool IsEOF() const 424 { 425 return currentPtr_ >= inputEnd_; 426 } 427 GetCurrentPC()428 uint32_t GetCurrentPC() const 429 { 430 return currentPc_; 431 } 432 PushStack(uintptr_t val)433 void PushStack(uintptr_t val) 434 { 435 ASSERT(currentStack_ < nStack_); 436 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 437 stack_[currentStack_++] = val; 438 } 439 SetStackValue(uintptr_t val)440 void SetStackValue(uintptr_t val) const 441 { 442 ASSERT(currentStack_ >= 1); 443 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 444 stack_[currentStack_ - 1] = val; 445 } 446 PopStack()447 uintptr_t PopStack() 448 { 449 ASSERT(currentStack_ >= 1); 450 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 451 return stack_[--currentStack_]; 452 } 453 PeekStack()454 uintptr_t PeekStack() const 455 { 456 ASSERT(currentStack_ >= 1); 457 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 458 return stack_[currentStack_ - 1]; 459 } 460 GetCurrentPtr()461 const uint8_t *GetCurrentPtr() const 462 { 463 return currentPtr_; 464 } 465 GetCaptureResultList()466 CaptureState *GetCaptureResultList() const 467 { 468 return captureResultList_; 469 } 470 GetCaptureCount()471 uint32_t GetCaptureCount() const 472 { 473 return nCapture_; 474 } 475 476 void DumpResult(std::ostream &out) const; 477 478 void PushRegExpState(StateType type, uint32_t pc); 479 480 RegExpState *PopRegExpState(bool copyCaptrue = true); 481 DropRegExpState()482 void DropRegExpState() 483 { 484 stateStackLen_--; 485 } 486 PeekRegExpState()487 RegExpState *PeekRegExpState() const 488 { 489 ASSERT(stateStackLen_ >= 1); 490 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 491 return reinterpret_cast<RegExpState *>(stateStack_ + (stateStackLen_ - 1) * stateSize_); 492 } 493 494 void ReAllocStack(uint32_t stackLen); 495 IsWordChar(uint8_t value)496 inline bool IsWordChar(uint8_t value) const 497 { 498 return ((value >= '0' && value <= '9') || (value >= 'a' && value <= 'z') || (value >= 'A' && value <= 'Z') || 499 (value == '_')); 500 } 501 IsTerminator(uint32_t value)502 inline bool IsTerminator(uint32_t value) const 503 { 504 // NOLINTNEXTLINE(readability-magic-numbers) 505 return (value == '\n' || value == '\r' || value == 0x2028 || value == 0x2029); 506 } 507 IsIgnoreCase()508 inline bool IsIgnoreCase() const 509 { 510 return (flags_ & RegExpParser::FLAG_IGNORECASE) != 0; 511 } 512 IsUtf16()513 inline bool IsUtf16() const 514 { 515 return (flags_ & RegExpParser::FLAG_UTF16) != 0; 516 } 517 IsWideChar()518 inline bool IsWideChar() const 519 { 520 return isWideChar_; 521 } 522 GetInputPtr()523 inline uint8_t *GetInputPtr() const 524 { 525 return input_; 526 } 527 528 static constexpr size_t CHAR_SIZE = 1; 529 static constexpr size_t WIDE_CHAR_SIZE = 2; 530 static constexpr size_t SAVE_RESET_START = 1; 531 static constexpr size_t SAVE_RESET_END = 2; 532 static constexpr size_t LOOP_MIN_OFFSET = 5; 533 static constexpr size_t LOOP_MAX_OFFSET = 9; 534 static constexpr size_t LOOP_PC_OFFSET = 1; 535 static constexpr size_t RANGE32_HEAD_OFFSET = 3; 536 static constexpr size_t RANGE32_MAX_HALF_OFFSET = 4; 537 static constexpr size_t RANGE32_MAX_OFFSET = 8; 538 static constexpr size_t RANGE32_OFFSET = 2; 539 static constexpr uint32_t STACK_MULTIPLIER = 2; 540 static constexpr uint32_t MIN_STACK_SIZE = 8; 541 542 private: 543 uint8_t *input_ = nullptr; 544 uint8_t *inputEnd_ = nullptr; 545 bool isWideChar_ = false; 546 547 uint32_t currentPc_ = 0; 548 const uint8_t *currentPtr_ = nullptr; 549 CaptureState *captureResultList_ = nullptr; 550 uintptr_t *stack_ = nullptr; 551 uint32_t currentStack_ = 0; 552 553 uint32_t nCapture_ = 0; 554 uint32_t nStack_ = 0; 555 556 uint32_t flags_ = 0; 557 uint32_t stateStackLen_ = 0; 558 uint32_t stateStackSize_ = 0; 559 uint32_t stateSize_ = 0; 560 uint8_t *stateStack_ = nullptr; 561 }; 562 } // namespace ark 563 #endif // core_REGEXP_REGEXP_EXECUTOR_H 564