• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef PANDA_RUNTIME_REGEXP_EXECUTOR_H
17 #define PANDA_RUNTIME_REGEXP_EXECUTOR_H
18 
19 #include "runtime/regexp/ecmascript/regexp_parser.h"
20 
21 namespace ark {
22 
23 template <class T>
24 struct RegExpMatchResult {
25     uint32_t endIndex = 0;
26     uint32_t index = 0;
27     // NOTE(kirill-mitkin): Change ecmascript API to PandaVector<T>
28     // first value is true if result is undefined
29     PandaVector<std::pair<bool, T>> captures;
30     PandaVector<std::pair<uint32_t, uint32_t>> indices;
31     bool isSuccess = false;
32     bool isWide = false;
33 };
34 
35 class RegExpExecutor {
36 public:
37     struct CaptureState {
38         const uint8_t *captureStart;
39         const uint8_t *captureEnd;
40     };
41 
42     enum StateType : uint8_t {
43         STATE_SPLIT = 0,
44         STATE_MATCH_AHEAD,
45         STATE_NEGATIVE_MATCH_AHEAD,
46     };
47 
48     struct RegExpState {
49         StateType type = STATE_SPLIT;
50         uint32_t currentPc = 0;
51         uint32_t currentStack = 0;
52         const uint8_t *currentPtr = nullptr;
53         __extension__ CaptureState *captureResultList[0];  // NOLINT(modernize-avoid-c-arrays)
54     };
55 
56     explicit RegExpExecutor() = default;
57 
~RegExpExecutor()58     ~RegExpExecutor()
59     {
60         auto allocator = Runtime::GetCurrent()->GetInternalAllocator();
61         allocator->DeleteArray(stack_);
62         allocator->DeleteArray(captureResultList_);
63         allocator->DeleteArray(stateStack_);
64     }
65 
66     NO_COPY_SEMANTIC(RegExpExecutor);
67     NO_MOVE_SEMANTIC(RegExpExecutor);
68 
69     PANDA_PUBLIC_API bool Execute(const uint8_t *input, uint32_t lastIndex, uint32_t length, uint8_t *buf,
70                                   bool isWideChar = false);
71 
72     bool ExecuteInternal(const DynChunk &byteCode, uint32_t pcEnd);
HandleFirstSplit()73     inline bool HandleFirstSplit()
74     {
75         if (GetCurrentPC() == RegExpParser::OP_START_OFFSET && stateStackLen_ == 0 &&
76             (flags_ & RegExpParser::FLAG_STICKY) == 0) {
77             if (IsEOF()) {
78                 if (MatchFailed()) {
79                     return false;
80                 }
81             } else {
82                 AdvanceCurrentPtr();
83                 PushRegExpState(STATE_SPLIT, RegExpParser::OP_START_OFFSET);
84             }
85         }
86         return true;
87     }
88 
HandleOpAll(uint8_t opCode)89     inline bool HandleOpAll(uint8_t opCode)
90     {
91         if (IsEOF()) {
92             return !MatchFailed();
93         }
94         uint32_t currentChar = GetCurrentChar();
95         if ((opCode == RegExpOpCode::OP_DOTS) && IsTerminator(currentChar)) {
96             return !MatchFailed();
97         }
98         Advance(opCode);
99         return true;
100     }
101 
HandleOpChar(const DynChunk & byteCode,uint8_t opCode)102     inline bool HandleOpChar(const DynChunk &byteCode, uint8_t opCode)
103     {
104         uint32_t expectedChar;
105         if (opCode == RegExpOpCode::OP_CHAR32) {
106             expectedChar = byteCode.GetU32(GetCurrentPC() + 1);
107         } else {
108             expectedChar = byteCode.GetU16(GetCurrentPC() + 1);
109         }
110         if (IsEOF()) {
111             return !MatchFailed();
112         }
113         uint32_t currentChar = GetCurrentChar();
114         if (IsIgnoreCase()) {
115             currentChar = static_cast<uint32_t>(RegExpParser::Canonicalize(currentChar, IsUtf16()));
116         }
117         if (currentChar == expectedChar) {
118             Advance(opCode);
119         } else {
120             if (MatchFailed()) {
121                 return false;
122             }
123         }
124         return true;
125     }
126 
HandleOpWordBoundary(uint8_t opCode)127     inline bool HandleOpWordBoundary(uint8_t opCode)
128     {
129         if (IsEOF()) {
130             if (opCode == RegExpOpCode::OP_WORD_BOUNDARY) {
131                 Advance(opCode);
132             } else {
133                 if (MatchFailed()) {
134                     return false;
135                 }
136             }
137             return true;
138         }
139         bool preIsWord = false;
140         if (GetCurrentPtr() != input_) {
141             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
142             preIsWord = IsWordChar(PeekPrevChar(currentPtr_, input_));
143         }
144         bool currentIsWord = IsWordChar(PeekChar(currentPtr_, inputEnd_));
145         if (((opCode == RegExpOpCode::OP_WORD_BOUNDARY) &&
146              ((!preIsWord && currentIsWord) || (preIsWord && !currentIsWord))) ||
147             ((opCode == RegExpOpCode::OP_NOT_WORD_BOUNDARY) &&
148              ((preIsWord && currentIsWord) || (!preIsWord && !currentIsWord)))) {
149             Advance(opCode);
150         } else {
151             if (MatchFailed()) {
152                 return false;
153             }
154         }
155         return true;
156     }
157 
HandleOpLineStart(uint8_t opCode)158     inline bool HandleOpLineStart(uint8_t opCode)
159     {
160         if (IsEOF()) {
161             return !MatchFailed();
162         }
163         if ((GetCurrentPtr() == input_) ||
164             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
165             ((flags_ & RegExpParser::FLAG_MULTILINE) != 0 && PeekPrevChar(currentPtr_, input_) == '\n')) {
166             Advance(opCode);
167         } else if (MatchFailed()) {
168             return false;
169         }
170         return true;
171     }
172 
HandleOpLineEnd(uint8_t opCode)173     inline bool HandleOpLineEnd(uint8_t opCode)
174     {
175         if (IsEOF() ||
176             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
177             ((flags_ & RegExpParser::FLAG_MULTILINE) != 0 && PeekChar(currentPtr_, inputEnd_) == '\n')) {
178             Advance(opCode);
179         } else {
180             if (MatchFailed()) {
181                 return false;
182             }
183         }
184         return true;
185     }
186 
HandleOpSaveStart(const DynChunk & byteCode,uint8_t opCode)187     inline void HandleOpSaveStart(const DynChunk &byteCode, uint8_t opCode)
188     {
189         uint32_t captureIndex = byteCode.GetU8(GetCurrentPC() + 1);
190         ASSERT(captureIndex < nCapture_);
191         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
192         CaptureState *captureState = &captureResultList_[captureIndex];
193         captureState->captureStart = GetCurrentPtr();
194         Advance(opCode);
195     }
196 
HandleOpSaveEnd(const DynChunk & byteCode,uint8_t opCode)197     inline void HandleOpSaveEnd(const DynChunk &byteCode, uint8_t opCode)
198     {
199         uint32_t captureIndex = byteCode.GetU8(GetCurrentPC() + 1);
200         ASSERT(captureIndex < nCapture_);
201         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
202         CaptureState *captureState = &captureResultList_[captureIndex];
203         captureState->captureEnd = GetCurrentPtr();
204         Advance(opCode);
205     }
206 
HandleOpSaveReset(const DynChunk & byteCode,uint8_t opCode)207     inline void HandleOpSaveReset(const DynChunk &byteCode, uint8_t opCode)
208     {
209         uint32_t catpureStartIndex = byteCode.GetU8(GetCurrentPC() + SAVE_RESET_START);
210         uint32_t catpureEndIndex = byteCode.GetU8(GetCurrentPC() + SAVE_RESET_END);
211         for (uint32_t i = catpureStartIndex; i <= catpureEndIndex; i++) {
212             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
213             CaptureState *captureState = &captureResultList_[i];
214             captureState->captureStart = nullptr;
215             captureState->captureEnd = nullptr;
216         }
217         Advance(opCode);
218     }
219 
HandleOpMatch(const DynChunk & byteCode,uint8_t opCode)220     inline void HandleOpMatch(const DynChunk &byteCode, uint8_t opCode)
221     {
222         auto type = static_cast<StateType>(opCode - RegExpOpCode::OP_SPLIT_NEXT);
223         ASSERT(type == STATE_SPLIT || type == STATE_MATCH_AHEAD || type == STATE_NEGATIVE_MATCH_AHEAD);
224         uint32_t offset = byteCode.GetU32(GetCurrentPC() + 1);
225         Advance(opCode);
226         uint32_t splitPc = GetCurrentPC() + offset;
227         PushRegExpState(type, splitPc);
228     }
229 
HandleOpSplitFirst(const DynChunk & byteCode,uint8_t opCode)230     inline void HandleOpSplitFirst(const DynChunk &byteCode, uint8_t opCode)
231     {
232         uint32_t offset = byteCode.GetU32(GetCurrentPC() + 1);
233         Advance(opCode);
234         PushRegExpState(STATE_SPLIT, GetCurrentPC());
235         AdvanceOffset(offset);
236     }
237 
HandleOpPrev(uint8_t opCode)238     inline bool HandleOpPrev(uint8_t opCode)
239     {
240         if (GetCurrentPtr() == input_) {
241             if (MatchFailed()) {
242                 return false;
243             }
244         } else {
245             PrevPtr(&currentPtr_, input_);
246             Advance(opCode);
247         }
248         return true;
249     }
250 
HandleOpLoop(const DynChunk & byteCode,uint8_t opCode)251     inline void HandleOpLoop(const DynChunk &byteCode, uint8_t opCode)
252     {
253         uint32_t quantifyMin = byteCode.GetU32(GetCurrentPC() + LOOP_MIN_OFFSET);
254         uint32_t quantifyMax = byteCode.GetU32(GetCurrentPC() + LOOP_MAX_OFFSET);
255         uint32_t pcOffset = byteCode.GetU32(GetCurrentPC() + LOOP_PC_OFFSET);
256         Advance(opCode);
257         uint32_t loopPcEnd = GetCurrentPC();
258         uint32_t loopPcStart = GetCurrentPC() + pcOffset;
259         bool isGreedy = opCode == RegExpOpCode::OP_LOOP_GREEDY;
260         uint32_t loopMax = isGreedy ? quantifyMax : quantifyMin;
261 
262         uint32_t loopCount = PeekStack();
263         SetStackValue(++loopCount);
264         if (loopCount < loopMax) {
265             // greedy failed, goto next
266             if (loopCount >= quantifyMin) {
267                 PushRegExpState(STATE_SPLIT, loopPcEnd);
268             }
269             // Goto loop start
270             SetCurrentPC(loopPcStart);
271         } else {
272             if (!isGreedy && (loopCount < quantifyMax)) {
273                 PushRegExpState(STATE_SPLIT, loopPcStart);
274             }
275         }
276     }
277 
HandleOpRange32(const DynChunk & byteCode)278     inline bool HandleOpRange32(const DynChunk &byteCode)
279     {
280         if (IsEOF()) {
281             return !MatchFailed();
282         }
283         uint32_t currentChar = GetCurrentChar();
284         if (IsIgnoreCase()) {
285             currentChar = static_cast<uint32_t>(RegExpParser::Canonicalize(currentChar, IsUtf16()));
286         }
287         uint16_t rangeCount = byteCode.GetU16(GetCurrentPC() + 1);
288         bool isFound = false;
289         int32_t idxMin = 0;
290         int32_t idxMax = static_cast<int32_t>(rangeCount) - 1;
291         int32_t idx = 0;
292         uint32_t low = 0;
293         uint32_t high = byteCode.GetU32(GetCurrentPC() + RANGE32_HEAD_OFFSET + idxMax * RANGE32_MAX_OFFSET +
294                                         RANGE32_MAX_HALF_OFFSET);
295         if (currentChar <= high) {
296             while (idxMin <= idxMax) {
297                 idx = (idxMin + idxMax) / RANGE32_OFFSET;
298                 low = byteCode.GetU32(GetCurrentPC() + RANGE32_HEAD_OFFSET +
299                                       static_cast<uint32_t>(idx) * RANGE32_MAX_OFFSET);
300                 high = byteCode.GetU32(GetCurrentPC() + RANGE32_HEAD_OFFSET +
301                                        static_cast<uint32_t>(idx) * RANGE32_MAX_OFFSET + RANGE32_MAX_HALF_OFFSET);
302                 if (currentChar < low) {
303                     idxMax = idx - 1;
304                 } else if (currentChar > high) {
305                     idxMin = idx + 1;
306                 } else {
307                     isFound = true;
308                     break;
309                 }
310             }
311         }
312         if (isFound) {
313             AdvanceOffset(rangeCount * RANGE32_MAX_OFFSET + RANGE32_HEAD_OFFSET);
314         } else {
315             if (MatchFailed()) {
316                 return false;
317             }
318         }
319         return true;
320     }
321 
HandleOpRange(const DynChunk & byteCode)322     inline bool HandleOpRange(const DynChunk &byteCode)
323     {
324         if (IsEOF()) {
325             return !MatchFailed();
326         }
327         uint32_t currentChar = GetCurrentChar();
328         if (IsIgnoreCase()) {
329             currentChar = static_cast<uint32_t>(RegExpParser::Canonicalize(currentChar, IsUtf16()));
330         }
331         uint16_t rangeCount = byteCode.GetU16(GetCurrentPC() + 1);
332         bool isFound = false;
333         int32_t idxMin = 0;
334         int32_t idxMax = rangeCount - 1;
335         int32_t idx = 0;
336         uint32_t low = 0;
337         uint32_t high =
338             byteCode.GetU16(GetCurrentPC() + RANGE32_HEAD_OFFSET + idxMax * RANGE32_MAX_HALF_OFFSET + RANGE32_OFFSET);
339         if (currentChar <= high) {
340             while (idxMin <= idxMax) {
341                 idx = (idxMin + idxMax) / RANGE32_OFFSET;
342                 low = byteCode.GetU16(GetCurrentPC() + RANGE32_HEAD_OFFSET +
343                                       static_cast<uint32_t>(idx) * RANGE32_MAX_HALF_OFFSET);
344                 high = byteCode.GetU16(GetCurrentPC() + RANGE32_HEAD_OFFSET +
345                                        static_cast<uint32_t>(idx) * RANGE32_MAX_HALF_OFFSET + RANGE32_OFFSET);
346                 if (currentChar < low) {
347                     idxMax = idx - 1;
348                 } else if (currentChar > high) {
349                     idxMin = idx + 1;
350                 } else {
351                     isFound = true;
352                     break;
353                 }
354             }
355         }
356         if (isFound) {
357             AdvanceOffset(rangeCount * RANGE32_MAX_HALF_OFFSET + RANGE32_HEAD_OFFSET);
358         } else {
359             if (MatchFailed()) {
360                 return false;
361             }
362         }
363         return true;
364     }
365 
366     bool HandleOpBackReference(const DynChunk &byteCode, uint8_t opCode);
367 
368     bool HandleOpBackReferenceMatch(const uint8_t *captureStart, const uint8_t *captureEnd, uint8_t opCode);
369 
370     bool HandleOpBackwardBackReferenceMatch(const uint8_t *captureStart, const uint8_t *captureEnd, uint8_t opCode);
371 
372     inline void Advance(uint8_t opCode, uint32_t offset = 0)
373     {
374         currentPc_ += offset + static_cast<uint32_t>(RegExpOpCode::GetRegExpOpCode(opCode)->GetSize());
375     }
376 
AdvanceOffset(uint32_t offset)377     inline void AdvanceOffset(uint32_t offset)
378     {
379         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
380         currentPc_ += offset;
381     }
382 
GetCurrentChar()383     inline uint32_t GetCurrentChar()
384     {
385         return GetChar(&currentPtr_, inputEnd_);
386     }
387 
AdvanceCurrentPtr()388     inline void AdvanceCurrentPtr()
389     {
390         AdvancePtr(&currentPtr_, inputEnd_);
391     }
392 
393     uint32_t GetChar(const uint8_t **pp, const uint8_t *end) const;
394 
395     uint32_t PeekChar(const uint8_t *p, const uint8_t *end) const;
396 
397     void AdvancePtr(const uint8_t **pp, const uint8_t *end) const;
398 
399     uint32_t PeekPrevChar(const uint8_t *p, const uint8_t *start) const;
400 
401     uint32_t GetPrevChar(const uint8_t **pp, const uint8_t *start) const;
402 
403     void PrevPtr(const uint8_t **pp, const uint8_t *start) const;
404 
405     bool MatchFailed(bool isMatched = false);
406 
SetCurrentPC(uint32_t pc)407     void SetCurrentPC(uint32_t pc)
408     {
409         currentPc_ = pc;
410     }
411 
SetCurrentPtr(const uint8_t * ptr)412     void SetCurrentPtr(const uint8_t *ptr)
413     {
414         currentPtr_ = ptr;
415     }
416 
IsEOF()417     bool IsEOF() const
418     {
419         return currentPtr_ >= inputEnd_;
420     }
421 
GetCurrentPC()422     uint32_t GetCurrentPC() const
423     {
424         return currentPc_;
425     }
426 
PushStack(uintptr_t val)427     void PushStack(uintptr_t val)
428     {
429         ASSERT(currentStack_ < nStack_);
430         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
431         stack_[currentStack_++] = val;
432     }
433 
SetStackValue(uintptr_t val)434     void SetStackValue(uintptr_t val) const
435     {
436         ASSERT(currentStack_ >= 1);
437         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
438         stack_[currentStack_ - 1] = val;
439     }
440 
PopStack()441     uintptr_t PopStack()
442     {
443         ASSERT(currentStack_ >= 1);
444         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
445         return stack_[--currentStack_];
446     }
447 
PeekStack()448     uintptr_t PeekStack() const
449     {
450         ASSERT(currentStack_ >= 1);
451         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
452         return stack_[currentStack_ - 1];
453     }
454 
GetCurrentPtr()455     const uint8_t *GetCurrentPtr() const
456     {
457         return currentPtr_;
458     }
459 
GetCaptureResultList()460     CaptureState *GetCaptureResultList() const
461     {
462         return captureResultList_;
463     }
464 
GetCaptureCount()465     uint32_t GetCaptureCount() const
466     {
467         return nCapture_;
468     }
469 
470     void DumpResult(std::ostream &out) const;
471 
472     void PushRegExpState(StateType type, uint32_t pc);
473 
474     RegExpState *PopRegExpState(bool copyCaptrue = true);
475 
DropRegExpState()476     void DropRegExpState()
477     {
478         stateStackLen_--;
479     }
480 
PeekRegExpState()481     RegExpState *PeekRegExpState() const
482     {
483         ASSERT(stateStackLen_ >= 1);
484         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
485         return reinterpret_cast<RegExpState *>(stateStack_ + (stateStackLen_ - 1) * stateSize_);
486     }
487 
488     void ReAllocStack(uint32_t stackLen);
489 
IsWordChar(uint8_t value)490     inline bool IsWordChar(uint8_t value) const
491     {
492         return ((value >= '0' && value <= '9') || (value >= 'a' && value <= 'z') || (value >= 'A' && value <= 'Z') ||
493                 (value == '_'));
494     }
495 
IsTerminator(uint32_t value)496     inline bool IsTerminator(uint32_t value) const
497     {
498         // NOLINTNEXTLINE(readability-magic-numbers)
499         return (value == '\n' || value == '\r' || value == 0x2028 || value == 0x2029);
500     }
501 
IsIgnoreCase()502     inline bool IsIgnoreCase() const
503     {
504         return (flags_ & RegExpParser::FLAG_IGNORECASE) != 0;
505     }
506 
IsUtf16()507     inline bool IsUtf16() const
508     {
509         return (flags_ & RegExpParser::FLAG_UTF16) != 0;
510     }
511 
IsWideChar()512     inline bool IsWideChar() const
513     {
514         return isWideChar_;
515     }
516 
GetInputPtr()517     inline uint8_t *GetInputPtr() const
518     {
519         return input_;
520     }
521 
522     static constexpr size_t CHAR_SIZE = 1;
523     static constexpr size_t WIDE_CHAR_SIZE = 2;
524     static constexpr size_t SAVE_RESET_START = 1;
525     static constexpr size_t SAVE_RESET_END = 2;
526     static constexpr size_t LOOP_MIN_OFFSET = 5;
527     static constexpr size_t LOOP_MAX_OFFSET = 9;
528     static constexpr size_t LOOP_PC_OFFSET = 1;
529     static constexpr size_t RANGE32_HEAD_OFFSET = 3;
530     static constexpr size_t RANGE32_MAX_HALF_OFFSET = 4;
531     static constexpr size_t RANGE32_MAX_OFFSET = 8;
532     static constexpr size_t RANGE32_OFFSET = 2;
533     static constexpr uint32_t STACK_MULTIPLIER = 2;
534     static constexpr uint32_t MIN_STACK_SIZE = 8;
535 
536 private:
537     uint8_t *input_ = nullptr;
538     uint8_t *inputEnd_ = nullptr;
539     bool isWideChar_ = false;
540 
541     uint32_t currentPc_ = 0;
542     const uint8_t *currentPtr_ = nullptr;
543     CaptureState *captureResultList_ = nullptr;
544     uintptr_t *stack_ = nullptr;
545     uint32_t currentStack_ = 0;
546 
547     uint32_t nCapture_ = 0;
548     uint32_t nStack_ = 0;
549 
550     uint32_t flags_ = 0;
551     uint32_t stateStackLen_ = 0;
552     uint32_t stateStackSize_ = 0;
553     uint32_t stateSize_ = 0;
554     uint8_t *stateStack_ = nullptr;
555 };
556 }  // namespace ark
557 #endif  // core_REGEXP_REGEXP_EXECUTOR_H
558