1 /** 2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #ifndef PANDA_RUNTIME_REGEXP_EXECUTOR_H 17 #define PANDA_RUNTIME_REGEXP_EXECUTOR_H 18 19 #include "runtime/regexp/ecmascript/regexp_parser.h" 20 21 namespace ark { 22 23 template <class T> 24 struct RegExpMatchResult { 25 uint32_t endIndex = 0; 26 uint32_t index = 0; 27 // NOTE(kirill-mitkin): Change ecmascript API to PandaVector<T> 28 // first value is true if result is undefined 29 PandaVector<std::pair<bool, T>> captures; 30 PandaVector<std::pair<uint32_t, uint32_t>> indices; 31 bool isSuccess = false; 32 bool isWide = false; 33 }; 34 35 class RegExpExecutor { 36 public: 37 struct CaptureState { 38 const uint8_t *captureStart; 39 const uint8_t *captureEnd; 40 }; 41 42 enum StateType : uint8_t { 43 STATE_SPLIT = 0, 44 STATE_MATCH_AHEAD, 45 STATE_NEGATIVE_MATCH_AHEAD, 46 }; 47 48 struct RegExpState { 49 StateType type = STATE_SPLIT; 50 uint32_t currentPc = 0; 51 uint32_t currentStack = 0; 52 const uint8_t *currentPtr = nullptr; 53 __extension__ CaptureState *captureResultList[0]; // NOLINT(modernize-avoid-c-arrays) 54 }; 55 56 explicit RegExpExecutor() = default; 57 ~RegExpExecutor()58 ~RegExpExecutor() 59 { 60 auto allocator = Runtime::GetCurrent()->GetInternalAllocator(); 61 allocator->DeleteArray(stack_); 62 allocator->DeleteArray(captureResultList_); 63 allocator->DeleteArray(stateStack_); 64 } 65 66 NO_COPY_SEMANTIC(RegExpExecutor); 67 NO_MOVE_SEMANTIC(RegExpExecutor); 68 69 PANDA_PUBLIC_API bool Execute(const uint8_t *input, uint32_t lastIndex, uint32_t length, uint8_t *buf, 70 bool isWideChar = false); 71 72 bool ExecuteInternal(const DynChunk &byteCode, uint32_t pcEnd); HandleFirstSplit()73 inline bool HandleFirstSplit() 74 { 75 if (GetCurrentPC() == RegExpParser::OP_START_OFFSET && stateStackLen_ == 0 && 76 (flags_ & RegExpParser::FLAG_STICKY) == 0) { 77 if (IsEOF()) { 78 if (MatchFailed()) { 79 return false; 80 } 81 } else { 82 AdvanceCurrentPtr(); 83 PushRegExpState(STATE_SPLIT, RegExpParser::OP_START_OFFSET); 84 } 85 } 86 return true; 87 } 88 HandleOpAll(uint8_t opCode)89 inline bool HandleOpAll(uint8_t opCode) 90 { 91 if (IsEOF()) { 92 return !MatchFailed(); 93 } 94 uint32_t currentChar = GetCurrentChar(); 95 if ((opCode == RegExpOpCode::OP_DOTS) && IsTerminator(currentChar)) { 96 return !MatchFailed(); 97 } 98 Advance(opCode); 99 return true; 100 } 101 HandleOpChar(const DynChunk & byteCode,uint8_t opCode)102 inline bool HandleOpChar(const DynChunk &byteCode, uint8_t opCode) 103 { 104 uint32_t expectedChar; 105 if (opCode == RegExpOpCode::OP_CHAR32) { 106 expectedChar = byteCode.GetU32(GetCurrentPC() + 1); 107 } else { 108 expectedChar = byteCode.GetU16(GetCurrentPC() + 1); 109 } 110 if (IsEOF()) { 111 return !MatchFailed(); 112 } 113 uint32_t currentChar = GetCurrentChar(); 114 if (IsIgnoreCase()) { 115 currentChar = static_cast<uint32_t>(RegExpParser::Canonicalize(currentChar, IsUtf16())); 116 } 117 if (currentChar == expectedChar) { 118 Advance(opCode); 119 } else { 120 if (MatchFailed()) { 121 return false; 122 } 123 } 124 return true; 125 } 126 HandleOpWordBoundary(uint8_t opCode)127 inline bool HandleOpWordBoundary(uint8_t opCode) 128 { 129 if (IsEOF()) { 130 if (opCode == RegExpOpCode::OP_WORD_BOUNDARY) { 131 Advance(opCode); 132 } else { 133 if (MatchFailed()) { 134 return false; 135 } 136 } 137 return true; 138 } 139 bool preIsWord = false; 140 if (GetCurrentPtr() != input_) { 141 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 142 preIsWord = IsWordChar(PeekPrevChar(currentPtr_, input_)); 143 } 144 bool currentIsWord = IsWordChar(PeekChar(currentPtr_, inputEnd_)); 145 if (((opCode == RegExpOpCode::OP_WORD_BOUNDARY) && 146 ((!preIsWord && currentIsWord) || (preIsWord && !currentIsWord))) || 147 ((opCode == RegExpOpCode::OP_NOT_WORD_BOUNDARY) && 148 ((preIsWord && currentIsWord) || (!preIsWord && !currentIsWord)))) { 149 Advance(opCode); 150 } else { 151 if (MatchFailed()) { 152 return false; 153 } 154 } 155 return true; 156 } 157 HandleOpLineStart(uint8_t opCode)158 inline bool HandleOpLineStart(uint8_t opCode) 159 { 160 if (IsEOF()) { 161 return !MatchFailed(); 162 } 163 if ((GetCurrentPtr() == input_) || 164 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 165 ((flags_ & RegExpParser::FLAG_MULTILINE) != 0 && PeekPrevChar(currentPtr_, input_) == '\n')) { 166 Advance(opCode); 167 } else if (MatchFailed()) { 168 return false; 169 } 170 return true; 171 } 172 HandleOpLineEnd(uint8_t opCode)173 inline bool HandleOpLineEnd(uint8_t opCode) 174 { 175 if (IsEOF() || 176 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 177 ((flags_ & RegExpParser::FLAG_MULTILINE) != 0 && PeekChar(currentPtr_, inputEnd_) == '\n')) { 178 Advance(opCode); 179 } else { 180 if (MatchFailed()) { 181 return false; 182 } 183 } 184 return true; 185 } 186 HandleOpSaveStart(const DynChunk & byteCode,uint8_t opCode)187 inline void HandleOpSaveStart(const DynChunk &byteCode, uint8_t opCode) 188 { 189 uint32_t captureIndex = byteCode.GetU8(GetCurrentPC() + 1); 190 ASSERT(captureIndex < nCapture_); 191 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 192 CaptureState *captureState = &captureResultList_[captureIndex]; 193 captureState->captureStart = GetCurrentPtr(); 194 Advance(opCode); 195 } 196 HandleOpSaveEnd(const DynChunk & byteCode,uint8_t opCode)197 inline void HandleOpSaveEnd(const DynChunk &byteCode, uint8_t opCode) 198 { 199 uint32_t captureIndex = byteCode.GetU8(GetCurrentPC() + 1); 200 ASSERT(captureIndex < nCapture_); 201 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 202 CaptureState *captureState = &captureResultList_[captureIndex]; 203 captureState->captureEnd = GetCurrentPtr(); 204 Advance(opCode); 205 } 206 HandleOpSaveReset(const DynChunk & byteCode,uint8_t opCode)207 inline void HandleOpSaveReset(const DynChunk &byteCode, uint8_t opCode) 208 { 209 uint32_t catpureStartIndex = byteCode.GetU8(GetCurrentPC() + SAVE_RESET_START); 210 uint32_t catpureEndIndex = byteCode.GetU8(GetCurrentPC() + SAVE_RESET_END); 211 for (uint32_t i = catpureStartIndex; i <= catpureEndIndex; i++) { 212 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 213 CaptureState *captureState = &captureResultList_[i]; 214 captureState->captureStart = nullptr; 215 captureState->captureEnd = nullptr; 216 } 217 Advance(opCode); 218 } 219 HandleOpMatch(const DynChunk & byteCode,uint8_t opCode)220 inline void HandleOpMatch(const DynChunk &byteCode, uint8_t opCode) 221 { 222 auto type = static_cast<StateType>(opCode - RegExpOpCode::OP_SPLIT_NEXT); 223 ASSERT(type == STATE_SPLIT || type == STATE_MATCH_AHEAD || type == STATE_NEGATIVE_MATCH_AHEAD); 224 uint32_t offset = byteCode.GetU32(GetCurrentPC() + 1); 225 Advance(opCode); 226 uint32_t splitPc = GetCurrentPC() + offset; 227 PushRegExpState(type, splitPc); 228 } 229 HandleOpSplitFirst(const DynChunk & byteCode,uint8_t opCode)230 inline void HandleOpSplitFirst(const DynChunk &byteCode, uint8_t opCode) 231 { 232 uint32_t offset = byteCode.GetU32(GetCurrentPC() + 1); 233 Advance(opCode); 234 PushRegExpState(STATE_SPLIT, GetCurrentPC()); 235 AdvanceOffset(offset); 236 } 237 HandleOpPrev(uint8_t opCode)238 inline bool HandleOpPrev(uint8_t opCode) 239 { 240 if (GetCurrentPtr() == input_) { 241 if (MatchFailed()) { 242 return false; 243 } 244 } else { 245 PrevPtr(¤tPtr_, input_); 246 Advance(opCode); 247 } 248 return true; 249 } 250 HandleOpLoop(const DynChunk & byteCode,uint8_t opCode)251 inline void HandleOpLoop(const DynChunk &byteCode, uint8_t opCode) 252 { 253 uint32_t quantifyMin = byteCode.GetU32(GetCurrentPC() + LOOP_MIN_OFFSET); 254 uint32_t quantifyMax = byteCode.GetU32(GetCurrentPC() + LOOP_MAX_OFFSET); 255 uint32_t pcOffset = byteCode.GetU32(GetCurrentPC() + LOOP_PC_OFFSET); 256 Advance(opCode); 257 uint32_t loopPcEnd = GetCurrentPC(); 258 uint32_t loopPcStart = GetCurrentPC() + pcOffset; 259 bool isGreedy = opCode == RegExpOpCode::OP_LOOP_GREEDY; 260 uint32_t loopMax = isGreedy ? quantifyMax : quantifyMin; 261 262 uint32_t loopCount = PeekStack(); 263 SetStackValue(++loopCount); 264 if (loopCount < loopMax) { 265 // greedy failed, goto next 266 if (loopCount >= quantifyMin) { 267 PushRegExpState(STATE_SPLIT, loopPcEnd); 268 } 269 // Goto loop start 270 SetCurrentPC(loopPcStart); 271 } else { 272 if (!isGreedy && (loopCount < quantifyMax)) { 273 PushRegExpState(STATE_SPLIT, loopPcStart); 274 } 275 } 276 } 277 HandleOpRange32(const DynChunk & byteCode)278 inline bool HandleOpRange32(const DynChunk &byteCode) 279 { 280 if (IsEOF()) { 281 return !MatchFailed(); 282 } 283 uint32_t currentChar = GetCurrentChar(); 284 if (IsIgnoreCase()) { 285 currentChar = static_cast<uint32_t>(RegExpParser::Canonicalize(currentChar, IsUtf16())); 286 } 287 uint16_t rangeCount = byteCode.GetU16(GetCurrentPC() + 1); 288 bool isFound = false; 289 int32_t idxMin = 0; 290 int32_t idxMax = static_cast<int32_t>(rangeCount) - 1; 291 int32_t idx = 0; 292 uint32_t low = 0; 293 uint32_t high = byteCode.GetU32(GetCurrentPC() + RANGE32_HEAD_OFFSET + idxMax * RANGE32_MAX_OFFSET + 294 RANGE32_MAX_HALF_OFFSET); 295 if (currentChar <= high) { 296 while (idxMin <= idxMax) { 297 idx = (idxMin + idxMax) / RANGE32_OFFSET; 298 low = byteCode.GetU32(GetCurrentPC() + RANGE32_HEAD_OFFSET + 299 static_cast<uint32_t>(idx) * RANGE32_MAX_OFFSET); 300 high = byteCode.GetU32(GetCurrentPC() + RANGE32_HEAD_OFFSET + 301 static_cast<uint32_t>(idx) * RANGE32_MAX_OFFSET + RANGE32_MAX_HALF_OFFSET); 302 if (currentChar < low) { 303 idxMax = idx - 1; 304 } else if (currentChar > high) { 305 idxMin = idx + 1; 306 } else { 307 isFound = true; 308 break; 309 } 310 } 311 } 312 if (isFound) { 313 AdvanceOffset(rangeCount * RANGE32_MAX_OFFSET + RANGE32_HEAD_OFFSET); 314 } else { 315 if (MatchFailed()) { 316 return false; 317 } 318 } 319 return true; 320 } 321 HandleOpRange(const DynChunk & byteCode)322 inline bool HandleOpRange(const DynChunk &byteCode) 323 { 324 if (IsEOF()) { 325 return !MatchFailed(); 326 } 327 uint32_t currentChar = GetCurrentChar(); 328 if (IsIgnoreCase()) { 329 currentChar = static_cast<uint32_t>(RegExpParser::Canonicalize(currentChar, IsUtf16())); 330 } 331 uint16_t rangeCount = byteCode.GetU16(GetCurrentPC() + 1); 332 bool isFound = false; 333 int32_t idxMin = 0; 334 int32_t idxMax = rangeCount - 1; 335 int32_t idx = 0; 336 uint32_t low = 0; 337 uint32_t high = 338 byteCode.GetU16(GetCurrentPC() + RANGE32_HEAD_OFFSET + idxMax * RANGE32_MAX_HALF_OFFSET + RANGE32_OFFSET); 339 if (currentChar <= high) { 340 while (idxMin <= idxMax) { 341 idx = (idxMin + idxMax) / RANGE32_OFFSET; 342 low = byteCode.GetU16(GetCurrentPC() + RANGE32_HEAD_OFFSET + 343 static_cast<uint32_t>(idx) * RANGE32_MAX_HALF_OFFSET); 344 high = byteCode.GetU16(GetCurrentPC() + RANGE32_HEAD_OFFSET + 345 static_cast<uint32_t>(idx) * RANGE32_MAX_HALF_OFFSET + RANGE32_OFFSET); 346 if (currentChar < low) { 347 idxMax = idx - 1; 348 } else if (currentChar > high) { 349 idxMin = idx + 1; 350 } else { 351 isFound = true; 352 break; 353 } 354 } 355 } 356 if (isFound) { 357 AdvanceOffset(rangeCount * RANGE32_MAX_HALF_OFFSET + RANGE32_HEAD_OFFSET); 358 } else { 359 if (MatchFailed()) { 360 return false; 361 } 362 } 363 return true; 364 } 365 366 bool HandleOpBackReference(const DynChunk &byteCode, uint8_t opCode); 367 368 bool HandleOpBackReferenceMatch(const uint8_t *captureStart, const uint8_t *captureEnd, uint8_t opCode); 369 370 bool HandleOpBackwardBackReferenceMatch(const uint8_t *captureStart, const uint8_t *captureEnd, uint8_t opCode); 371 372 inline void Advance(uint8_t opCode, uint32_t offset = 0) 373 { 374 currentPc_ += offset + static_cast<uint32_t>(RegExpOpCode::GetRegExpOpCode(opCode)->GetSize()); 375 } 376 AdvanceOffset(uint32_t offset)377 inline void AdvanceOffset(uint32_t offset) 378 { 379 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 380 currentPc_ += offset; 381 } 382 GetCurrentChar()383 inline uint32_t GetCurrentChar() 384 { 385 return GetChar(¤tPtr_, inputEnd_); 386 } 387 AdvanceCurrentPtr()388 inline void AdvanceCurrentPtr() 389 { 390 AdvancePtr(¤tPtr_, inputEnd_); 391 } 392 393 uint32_t GetChar(const uint8_t **pp, const uint8_t *end) const; 394 395 uint32_t PeekChar(const uint8_t *p, const uint8_t *end) const; 396 397 void AdvancePtr(const uint8_t **pp, const uint8_t *end) const; 398 399 uint32_t PeekPrevChar(const uint8_t *p, const uint8_t *start) const; 400 401 uint32_t GetPrevChar(const uint8_t **pp, const uint8_t *start) const; 402 403 void PrevPtr(const uint8_t **pp, const uint8_t *start) const; 404 405 bool MatchFailed(bool isMatched = false); 406 SetCurrentPC(uint32_t pc)407 void SetCurrentPC(uint32_t pc) 408 { 409 currentPc_ = pc; 410 } 411 SetCurrentPtr(const uint8_t * ptr)412 void SetCurrentPtr(const uint8_t *ptr) 413 { 414 currentPtr_ = ptr; 415 } 416 IsEOF()417 bool IsEOF() const 418 { 419 return currentPtr_ >= inputEnd_; 420 } 421 GetCurrentPC()422 uint32_t GetCurrentPC() const 423 { 424 return currentPc_; 425 } 426 PushStack(uintptr_t val)427 void PushStack(uintptr_t val) 428 { 429 ASSERT(currentStack_ < nStack_); 430 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 431 stack_[currentStack_++] = val; 432 } 433 SetStackValue(uintptr_t val)434 void SetStackValue(uintptr_t val) const 435 { 436 ASSERT(currentStack_ >= 1); 437 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 438 stack_[currentStack_ - 1] = val; 439 } 440 PopStack()441 uintptr_t PopStack() 442 { 443 ASSERT(currentStack_ >= 1); 444 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 445 return stack_[--currentStack_]; 446 } 447 PeekStack()448 uintptr_t PeekStack() const 449 { 450 ASSERT(currentStack_ >= 1); 451 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 452 return stack_[currentStack_ - 1]; 453 } 454 GetCurrentPtr()455 const uint8_t *GetCurrentPtr() const 456 { 457 return currentPtr_; 458 } 459 GetCaptureResultList()460 CaptureState *GetCaptureResultList() const 461 { 462 return captureResultList_; 463 } 464 GetCaptureCount()465 uint32_t GetCaptureCount() const 466 { 467 return nCapture_; 468 } 469 470 void DumpResult(std::ostream &out) const; 471 472 void PushRegExpState(StateType type, uint32_t pc); 473 474 RegExpState *PopRegExpState(bool copyCaptrue = true); 475 DropRegExpState()476 void DropRegExpState() 477 { 478 stateStackLen_--; 479 } 480 PeekRegExpState()481 RegExpState *PeekRegExpState() const 482 { 483 ASSERT(stateStackLen_ >= 1); 484 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 485 return reinterpret_cast<RegExpState *>(stateStack_ + (stateStackLen_ - 1) * stateSize_); 486 } 487 488 void ReAllocStack(uint32_t stackLen); 489 IsWordChar(uint8_t value)490 inline bool IsWordChar(uint8_t value) const 491 { 492 return ((value >= '0' && value <= '9') || (value >= 'a' && value <= 'z') || (value >= 'A' && value <= 'Z') || 493 (value == '_')); 494 } 495 IsTerminator(uint32_t value)496 inline bool IsTerminator(uint32_t value) const 497 { 498 // NOLINTNEXTLINE(readability-magic-numbers) 499 return (value == '\n' || value == '\r' || value == 0x2028 || value == 0x2029); 500 } 501 IsIgnoreCase()502 inline bool IsIgnoreCase() const 503 { 504 return (flags_ & RegExpParser::FLAG_IGNORECASE) != 0; 505 } 506 IsUtf16()507 inline bool IsUtf16() const 508 { 509 return (flags_ & RegExpParser::FLAG_UTF16) != 0; 510 } 511 IsWideChar()512 inline bool IsWideChar() const 513 { 514 return isWideChar_; 515 } 516 GetInputPtr()517 inline uint8_t *GetInputPtr() const 518 { 519 return input_; 520 } 521 522 static constexpr size_t CHAR_SIZE = 1; 523 static constexpr size_t WIDE_CHAR_SIZE = 2; 524 static constexpr size_t SAVE_RESET_START = 1; 525 static constexpr size_t SAVE_RESET_END = 2; 526 static constexpr size_t LOOP_MIN_OFFSET = 5; 527 static constexpr size_t LOOP_MAX_OFFSET = 9; 528 static constexpr size_t LOOP_PC_OFFSET = 1; 529 static constexpr size_t RANGE32_HEAD_OFFSET = 3; 530 static constexpr size_t RANGE32_MAX_HALF_OFFSET = 4; 531 static constexpr size_t RANGE32_MAX_OFFSET = 8; 532 static constexpr size_t RANGE32_OFFSET = 2; 533 static constexpr uint32_t STACK_MULTIPLIER = 2; 534 static constexpr uint32_t MIN_STACK_SIZE = 8; 535 536 private: 537 uint8_t *input_ = nullptr; 538 uint8_t *inputEnd_ = nullptr; 539 bool isWideChar_ = false; 540 541 uint32_t currentPc_ = 0; 542 const uint8_t *currentPtr_ = nullptr; 543 CaptureState *captureResultList_ = nullptr; 544 uintptr_t *stack_ = nullptr; 545 uint32_t currentStack_ = 0; 546 547 uint32_t nCapture_ = 0; 548 uint32_t nStack_ = 0; 549 550 uint32_t flags_ = 0; 551 uint32_t stateStackLen_ = 0; 552 uint32_t stateStackSize_ = 0; 553 uint32_t stateSize_ = 0; 554 uint8_t *stateStack_ = nullptr; 555 }; 556 } // namespace ark 557 #endif // core_REGEXP_REGEXP_EXECUTOR_H 558