• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef PANDA_RUNTIME_REGEXP_EXECUTOR_H
17 #define PANDA_RUNTIME_REGEXP_EXECUTOR_H
18 
19 #include "runtime/regexp/ecmascript/regexp_parser.h"
20 
21 namespace ark {
22 
23 template <class T>
24 struct RegExpMatchResult {
25     uint32_t endIndex = 0;
26     uint32_t index = 0;
27     // NOTE(kirill-mitkin): Change ecmascript API to PandaVector<T>
28     // first value is true if result is undefined
29     PandaVector<std::pair<bool, T>> captures;
30     PandaVector<std::pair<uint32_t, uint32_t>> indices;
31     bool isSuccess = false;
32     bool isWide = false;
33 };
34 
35 class RegExpExecutor {
36 public:
37     struct CaptureState {
38         const uint8_t *captureStart;
39         const uint8_t *captureEnd;
40     };
41 
42     enum StateType : uint8_t {
43         STATE_SPLIT = 0,
44         STATE_MATCH_AHEAD,
45         STATE_NEGATIVE_MATCH_AHEAD,
46     };
47 
48     struct RegExpState {
49         StateType type = STATE_SPLIT;
50         uint32_t currentPc = 0;
51         uint32_t currentStack = 0;
52         const uint8_t *currentPtr = nullptr;
53         __extension__ CaptureState *captureResultList[0];  // NOLINT(modernize-avoid-c-arrays)
54     };
55 
56     explicit RegExpExecutor() = default;
57 
~RegExpExecutor()58     ~RegExpExecutor()
59     {
60         auto allocator = Runtime::GetCurrent()->GetInternalAllocator();
61         allocator->DeleteArray(stack_);
62         allocator->DeleteArray(captureResultList_);
63         allocator->DeleteArray(stateStack_);
64     }
65 
66     NO_COPY_SEMANTIC(RegExpExecutor);
67     NO_MOVE_SEMANTIC(RegExpExecutor);
68 
69     PANDA_PUBLIC_API bool Execute(const uint8_t *input, uint32_t lastIndex, uint32_t length, uint8_t *buf,
70                                   bool isWideChar = false);
71 
72     bool ExecuteInternal(const DynChunk &byteCode, uint32_t pcEnd);
73     // CC-OFFNXT(G.FUD.06) perf critical
HandleFirstSplit()74     inline bool HandleFirstSplit()
75     {
76         if (GetCurrentPC() == RegExpParser::OP_START_OFFSET && stateStackLen_ == 0 &&
77             (flags_ & RegExpParser::FLAG_STICKY) == 0) {
78             if (IsEOF()) {
79                 if (MatchFailed()) {
80                     return false;
81                 }
82             } else {
83                 AdvanceCurrentPtr();
84                 PushRegExpState(STATE_SPLIT, RegExpParser::OP_START_OFFSET);
85             }
86         }
87         return true;
88     }
89 
HandleOpAll(uint8_t opCode)90     inline bool HandleOpAll(uint8_t opCode)
91     {
92         if (IsEOF()) {
93             return !MatchFailed();
94         }
95         uint32_t currentChar = GetCurrentChar();
96         if ((opCode == RegExpOpCode::OP_DOTS) && IsTerminator(currentChar)) {
97             return !MatchFailed();
98         }
99         Advance(opCode);
100         return true;
101     }
102 
103     // CC-OFFNXT(G.FUD.06) perf critical
HandleOpChar(const DynChunk & byteCode,uint8_t opCode)104     inline bool HandleOpChar(const DynChunk &byteCode, uint8_t opCode)
105     {
106         uint32_t expectedChar;
107         if (opCode == RegExpOpCode::OP_CHAR32) {
108             expectedChar = byteCode.GetU32(GetCurrentPC() + 1);
109         } else {
110             expectedChar = byteCode.GetU16(GetCurrentPC() + 1);
111         }
112         if (IsEOF()) {
113             return !MatchFailed();
114         }
115         uint32_t currentChar = GetCurrentChar();
116         if (IsIgnoreCase()) {
117             currentChar = static_cast<uint32_t>(RegExpParser::Canonicalize(currentChar, IsUtf16()));
118         }
119         if (currentChar == expectedChar) {
120             Advance(opCode);
121         } else {
122             if (MatchFailed()) {
123                 return false;
124             }
125         }
126         return true;
127     }
128 
129     // CC-OFFNXT(G.FUD.06) perf critical
HandleOpWordBoundary(uint8_t opCode)130     inline bool HandleOpWordBoundary(uint8_t opCode)
131     {
132         if (IsEOF()) {
133             if (opCode == RegExpOpCode::OP_WORD_BOUNDARY) {
134                 Advance(opCode);
135             } else {
136                 if (MatchFailed()) {
137                     return false;
138                 }
139             }
140             return true;
141         }
142         bool preIsWord = false;
143         if (GetCurrentPtr() != input_) {
144             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
145             preIsWord = IsWordChar(PeekPrevChar(currentPtr_, input_));
146         }
147         bool currentIsWord = IsWordChar(PeekChar(currentPtr_, inputEnd_));
148         if (((opCode == RegExpOpCode::OP_WORD_BOUNDARY) &&
149              ((!preIsWord && currentIsWord) || (preIsWord && !currentIsWord))) ||
150             ((opCode == RegExpOpCode::OP_NOT_WORD_BOUNDARY) &&
151              ((preIsWord && currentIsWord) || (!preIsWord && !currentIsWord)))) {
152             Advance(opCode);
153         } else {
154             if (MatchFailed()) {
155                 return false;
156             }
157         }
158         return true;
159     }
160 
HandleOpLineStart(uint8_t opCode)161     inline bool HandleOpLineStart(uint8_t opCode)
162     {
163         if (IsEOF()) {
164             return !MatchFailed();
165         }
166         if ((GetCurrentPtr() == input_) ||
167             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
168             ((flags_ & RegExpParser::FLAG_MULTILINE) != 0 && PeekPrevChar(currentPtr_, input_) == '\n')) {
169             Advance(opCode);
170         } else if (MatchFailed()) {
171             return false;
172         }
173         return true;
174     }
175 
HandleOpLineEnd(uint8_t opCode)176     inline bool HandleOpLineEnd(uint8_t opCode)
177     {
178         if (IsEOF() ||
179             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
180             ((flags_ & RegExpParser::FLAG_MULTILINE) != 0 && PeekChar(currentPtr_, inputEnd_) == '\n')) {
181             Advance(opCode);
182         } else {
183             if (MatchFailed()) {
184                 return false;
185             }
186         }
187         return true;
188     }
189 
HandleOpSaveStart(const DynChunk & byteCode,uint8_t opCode)190     inline void HandleOpSaveStart(const DynChunk &byteCode, uint8_t opCode)
191     {
192         uint32_t captureIndex = byteCode.GetU8(GetCurrentPC() + 1);
193         ASSERT(captureIndex < nCapture_);
194         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
195         CaptureState *captureState = &captureResultList_[captureIndex];
196         captureState->captureStart = GetCurrentPtr();
197         Advance(opCode);
198     }
199 
HandleOpSaveEnd(const DynChunk & byteCode,uint8_t opCode)200     inline void HandleOpSaveEnd(const DynChunk &byteCode, uint8_t opCode)
201     {
202         uint32_t captureIndex = byteCode.GetU8(GetCurrentPC() + 1);
203         ASSERT(captureIndex < nCapture_);
204         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
205         CaptureState *captureState = &captureResultList_[captureIndex];
206         captureState->captureEnd = GetCurrentPtr();
207         Advance(opCode);
208     }
209 
HandleOpSaveReset(const DynChunk & byteCode,uint8_t opCode)210     inline void HandleOpSaveReset(const DynChunk &byteCode, uint8_t opCode)
211     {
212         uint32_t catpureStartIndex = byteCode.GetU8(GetCurrentPC() + SAVE_RESET_START);
213         uint32_t catpureEndIndex = byteCode.GetU8(GetCurrentPC() + SAVE_RESET_END);
214         for (uint32_t i = catpureStartIndex; i <= catpureEndIndex; i++) {
215             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
216             CaptureState *captureState = &captureResultList_[i];
217             captureState->captureStart = nullptr;
218             captureState->captureEnd = nullptr;
219         }
220         Advance(opCode);
221     }
222 
HandleOpMatch(const DynChunk & byteCode,uint8_t opCode)223     inline void HandleOpMatch(const DynChunk &byteCode, uint8_t opCode)
224     {
225         auto type = static_cast<StateType>(opCode - RegExpOpCode::OP_SPLIT_NEXT);
226         ASSERT(type == STATE_SPLIT || type == STATE_MATCH_AHEAD || type == STATE_NEGATIVE_MATCH_AHEAD);
227         uint32_t offset = byteCode.GetU32(GetCurrentPC() + 1);
228         Advance(opCode);
229         uint32_t splitPc = GetCurrentPC() + offset;
230         PushRegExpState(type, splitPc);
231     }
232 
HandleOpSplitFirst(const DynChunk & byteCode,uint8_t opCode)233     inline void HandleOpSplitFirst(const DynChunk &byteCode, uint8_t opCode)
234     {
235         uint32_t offset = byteCode.GetU32(GetCurrentPC() + 1);
236         Advance(opCode);
237         PushRegExpState(STATE_SPLIT, GetCurrentPC());
238         AdvanceOffset(offset);
239     }
240 
HandleOpPrev(uint8_t opCode)241     inline bool HandleOpPrev(uint8_t opCode)
242     {
243         if (GetCurrentPtr() == input_) {
244             if (MatchFailed()) {
245                 return false;
246             }
247         } else {
248             PrevPtr(&currentPtr_, input_);
249             Advance(opCode);
250         }
251         return true;
252     }
253 
254     // CC-OFFNXT(G.FUD.06) perf critical
HandleOpLoop(const DynChunk & byteCode,uint8_t opCode)255     inline void HandleOpLoop(const DynChunk &byteCode, uint8_t opCode)
256     {
257         uint32_t quantifyMin = byteCode.GetU32(GetCurrentPC() + LOOP_MIN_OFFSET);
258         uint32_t quantifyMax = byteCode.GetU32(GetCurrentPC() + LOOP_MAX_OFFSET);
259         uint32_t pcOffset = byteCode.GetU32(GetCurrentPC() + LOOP_PC_OFFSET);
260         Advance(opCode);
261         uint32_t loopPcEnd = GetCurrentPC();
262         uint32_t loopPcStart = GetCurrentPC() + pcOffset;
263         bool isGreedy = opCode == RegExpOpCode::OP_LOOP_GREEDY;
264         uint32_t loopMax = isGreedy ? quantifyMax : quantifyMin;
265 
266         uint32_t loopCount = PeekStack();
267         SetStackValue(++loopCount);
268         if (loopCount < loopMax) {
269             // greedy failed, goto next
270             if (loopCount >= quantifyMin) {
271                 PushRegExpState(STATE_SPLIT, loopPcEnd);
272             }
273             // Goto loop start
274             SetCurrentPC(loopPcStart);
275         } else {
276             if (!isGreedy && (loopCount < quantifyMax)) {
277                 PushRegExpState(STATE_SPLIT, loopPcStart);
278             }
279         }
280     }
281 
282     // CC-OFFNXT(G.FUD.06) perf critical
HandleOpRange32(const DynChunk & byteCode)283     inline bool HandleOpRange32(const DynChunk &byteCode)
284     {
285         if (IsEOF()) {
286             return !MatchFailed();
287         }
288         uint32_t currentChar = GetCurrentChar();
289         if (IsIgnoreCase()) {
290             currentChar = static_cast<uint32_t>(RegExpParser::Canonicalize(currentChar, IsUtf16()));
291         }
292         uint16_t rangeCount = byteCode.GetU16(GetCurrentPC() + 1);
293         bool isFound = false;
294         int32_t idxMin = 0;
295         int32_t idxMax = static_cast<int32_t>(rangeCount) - 1;
296         int32_t idx = 0;
297         uint32_t low = 0;
298         uint32_t high = byteCode.GetU32(GetCurrentPC() + RANGE32_HEAD_OFFSET + idxMax * RANGE32_MAX_OFFSET +
299                                         RANGE32_MAX_HALF_OFFSET);
300         if (currentChar <= high) {
301             while (idxMin <= idxMax) {
302                 idx = (idxMin + idxMax) / RANGE32_OFFSET;
303                 low = byteCode.GetU32(GetCurrentPC() + RANGE32_HEAD_OFFSET +
304                                       static_cast<uint32_t>(idx) * RANGE32_MAX_OFFSET);
305                 high = byteCode.GetU32(GetCurrentPC() + RANGE32_HEAD_OFFSET +
306                                        static_cast<uint32_t>(idx) * RANGE32_MAX_OFFSET + RANGE32_MAX_HALF_OFFSET);
307                 if (currentChar < low) {
308                     idxMax = idx - 1;
309                 } else if (currentChar > high) {
310                     idxMin = idx + 1;
311                 } else {
312                     isFound = true;
313                     break;
314                 }
315             }
316         }
317         if (isFound) {
318             AdvanceOffset(rangeCount * RANGE32_MAX_OFFSET + RANGE32_HEAD_OFFSET);
319         } else {
320             if (MatchFailed()) {
321                 return false;
322             }
323         }
324         return true;
325     }
326 
327     // CC-OFFNXT(G.FUD.06) perf critical
HandleOpRange(const DynChunk & byteCode)328     inline bool HandleOpRange(const DynChunk &byteCode)
329     {
330         if (IsEOF()) {
331             return !MatchFailed();
332         }
333         uint32_t currentChar = GetCurrentChar();
334         if (IsIgnoreCase()) {
335             currentChar = static_cast<uint32_t>(RegExpParser::Canonicalize(currentChar, IsUtf16()));
336         }
337         uint16_t rangeCount = byteCode.GetU16(GetCurrentPC() + 1);
338         bool isFound = false;
339         int32_t idxMin = 0;
340         int32_t idxMax = rangeCount - 1;
341         int32_t idx = 0;
342         uint32_t low = 0;
343         uint32_t high =
344             byteCode.GetU16(GetCurrentPC() + RANGE32_HEAD_OFFSET + idxMax * RANGE32_MAX_HALF_OFFSET + RANGE32_OFFSET);
345         if (currentChar <= high) {
346             while (idxMin <= idxMax) {
347                 idx = (idxMin + idxMax) / RANGE32_OFFSET;
348                 low = byteCode.GetU16(GetCurrentPC() + RANGE32_HEAD_OFFSET +
349                                       static_cast<uint32_t>(idx) * RANGE32_MAX_HALF_OFFSET);
350                 high = byteCode.GetU16(GetCurrentPC() + RANGE32_HEAD_OFFSET +
351                                        static_cast<uint32_t>(idx) * RANGE32_MAX_HALF_OFFSET + RANGE32_OFFSET);
352                 if (currentChar < low) {
353                     idxMax = idx - 1;
354                 } else if (currentChar > high) {
355                     idxMin = idx + 1;
356                 } else {
357                     isFound = true;
358                     break;
359                 }
360             }
361         }
362         if (isFound) {
363             AdvanceOffset(rangeCount * RANGE32_MAX_HALF_OFFSET + RANGE32_HEAD_OFFSET);
364         } else {
365             if (MatchFailed()) {
366                 return false;
367             }
368         }
369         return true;
370     }
371 
372     bool HandleOpBackReference(const DynChunk &byteCode, uint8_t opCode);
373 
374     bool HandleOpBackReferenceMatch(const uint8_t *captureStart, const uint8_t *captureEnd, uint8_t opCode);
375 
376     bool HandleOpBackwardBackReferenceMatch(const uint8_t *captureStart, const uint8_t *captureEnd, uint8_t opCode);
377 
378     inline void Advance(uint8_t opCode, uint32_t offset = 0)
379     {
380         currentPc_ += offset + static_cast<uint32_t>(RegExpOpCode::GetRegExpOpCode(opCode)->GetSize());
381     }
382 
AdvanceOffset(uint32_t offset)383     inline void AdvanceOffset(uint32_t offset)
384     {
385         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
386         currentPc_ += offset;
387     }
388 
GetCurrentChar()389     inline uint32_t GetCurrentChar()
390     {
391         return GetChar(&currentPtr_, inputEnd_);
392     }
393 
AdvanceCurrentPtr()394     inline void AdvanceCurrentPtr()
395     {
396         AdvancePtr(&currentPtr_, inputEnd_);
397     }
398 
399     uint32_t GetChar(const uint8_t **pp, const uint8_t *end) const;
400 
401     uint32_t PeekChar(const uint8_t *p, const uint8_t *end) const;
402 
403     void AdvancePtr(const uint8_t **pp, const uint8_t *end) const;
404 
405     uint32_t PeekPrevChar(const uint8_t *p, const uint8_t *start) const;
406 
407     uint32_t GetPrevChar(const uint8_t **pp, const uint8_t *start) const;
408 
409     void PrevPtr(const uint8_t **pp, const uint8_t *start) const;
410 
411     bool MatchFailed(bool isMatched = false);
412 
SetCurrentPC(uint32_t pc)413     void SetCurrentPC(uint32_t pc)
414     {
415         currentPc_ = pc;
416     }
417 
SetCurrentPtr(const uint8_t * ptr)418     void SetCurrentPtr(const uint8_t *ptr)
419     {
420         currentPtr_ = ptr;
421     }
422 
IsEOF()423     bool IsEOF() const
424     {
425         return currentPtr_ >= inputEnd_;
426     }
427 
GetCurrentPC()428     uint32_t GetCurrentPC() const
429     {
430         return currentPc_;
431     }
432 
PushStack(uintptr_t val)433     void PushStack(uintptr_t val)
434     {
435         ASSERT(currentStack_ < nStack_);
436         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
437         stack_[currentStack_++] = val;
438     }
439 
SetStackValue(uintptr_t val)440     void SetStackValue(uintptr_t val) const
441     {
442         ASSERT(currentStack_ >= 1);
443         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
444         stack_[currentStack_ - 1] = val;
445     }
446 
PopStack()447     uintptr_t PopStack()
448     {
449         ASSERT(currentStack_ >= 1);
450         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
451         return stack_[--currentStack_];
452     }
453 
PeekStack()454     uintptr_t PeekStack() const
455     {
456         ASSERT(currentStack_ >= 1);
457         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
458         return stack_[currentStack_ - 1];
459     }
460 
GetCurrentPtr()461     const uint8_t *GetCurrentPtr() const
462     {
463         return currentPtr_;
464     }
465 
GetCaptureResultList()466     CaptureState *GetCaptureResultList() const
467     {
468         return captureResultList_;
469     }
470 
GetCaptureCount()471     uint32_t GetCaptureCount() const
472     {
473         return nCapture_;
474     }
475 
476     void DumpResult(std::ostream &out) const;
477 
478     void PushRegExpState(StateType type, uint32_t pc);
479 
480     RegExpState *PopRegExpState(bool copyCaptrue = true);
481 
DropRegExpState()482     void DropRegExpState()
483     {
484         stateStackLen_--;
485     }
486 
PeekRegExpState()487     RegExpState *PeekRegExpState() const
488     {
489         ASSERT(stateStackLen_ >= 1);
490         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
491         return reinterpret_cast<RegExpState *>(stateStack_ + (stateStackLen_ - 1) * stateSize_);
492     }
493 
494     void ReAllocStack(uint32_t stackLen);
495 
IsWordChar(uint8_t value)496     inline bool IsWordChar(uint8_t value) const
497     {
498         return ((value >= '0' && value <= '9') || (value >= 'a' && value <= 'z') || (value >= 'A' && value <= 'Z') ||
499                 (value == '_'));
500     }
501 
IsTerminator(uint32_t value)502     inline bool IsTerminator(uint32_t value) const
503     {
504         // NOLINTNEXTLINE(readability-magic-numbers)
505         return (value == '\n' || value == '\r' || value == 0x2028 || value == 0x2029);
506     }
507 
IsIgnoreCase()508     inline bool IsIgnoreCase() const
509     {
510         return (flags_ & RegExpParser::FLAG_IGNORECASE) != 0;
511     }
512 
IsUtf16()513     inline bool IsUtf16() const
514     {
515         return (flags_ & RegExpParser::FLAG_UTF16) != 0;
516     }
517 
IsWideChar()518     inline bool IsWideChar() const
519     {
520         return isWideChar_;
521     }
522 
GetInputPtr()523     inline uint8_t *GetInputPtr() const
524     {
525         return input_;
526     }
527 
528     static constexpr size_t CHAR_SIZE = 1;
529     static constexpr size_t WIDE_CHAR_SIZE = 2;
530     static constexpr size_t SAVE_RESET_START = 1;
531     static constexpr size_t SAVE_RESET_END = 2;
532     static constexpr size_t LOOP_MIN_OFFSET = 5;
533     static constexpr size_t LOOP_MAX_OFFSET = 9;
534     static constexpr size_t LOOP_PC_OFFSET = 1;
535     static constexpr size_t RANGE32_HEAD_OFFSET = 3;
536     static constexpr size_t RANGE32_MAX_HALF_OFFSET = 4;
537     static constexpr size_t RANGE32_MAX_OFFSET = 8;
538     static constexpr size_t RANGE32_OFFSET = 2;
539     static constexpr uint32_t STACK_MULTIPLIER = 2;
540     static constexpr uint32_t MIN_STACK_SIZE = 8;
541 
542 private:
543     uint8_t *input_ = nullptr;
544     uint8_t *inputEnd_ = nullptr;
545     bool isWideChar_ = false;
546 
547     uint32_t currentPc_ = 0;
548     const uint8_t *currentPtr_ = nullptr;
549     CaptureState *captureResultList_ = nullptr;
550     uintptr_t *stack_ = nullptr;
551     uint32_t currentStack_ = 0;
552 
553     uint32_t nCapture_ = 0;
554     uint32_t nStack_ = 0;
555 
556     uint32_t flags_ = 0;
557     uint32_t stateStackLen_ = 0;
558     uint32_t stateStackSize_ = 0;
559     uint32_t stateSize_ = 0;
560     uint8_t *stateStack_ = nullptr;
561 };
562 }  // namespace ark
563 #endif  // core_REGEXP_REGEXP_EXECUTOR_H
564