1 /*
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "ecmascript/regexp/regexp_executor.h"
17
18 namespace panda::ecmascript {
19 using RegExpState = RegExpExecutor::RegExpState;
20 using RegExpGlobalResult = builtins::RegExpGlobalResult;
Execute(const uint8_t * input,uint32_t lastIndex,uint32_t length,uint8_t * buf,bool isWideChar)21 bool RegExpExecutor::Execute(const uint8_t *input, uint32_t lastIndex, uint32_t length, uint8_t *buf, bool isWideChar)
22 {
23 DynChunk buffer(buf, chunk_);
24 input_ = const_cast<uint8_t *>(input);
25 inputEnd_ = const_cast<uint8_t *>(input + length * (isWideChar ? WIDE_CHAR_SIZE : CHAR_SIZE));
26 uint32_t size = buffer.GetU32(0);
27 nCapture_ = buffer.GetU32(RegExpParser::NUM_CAPTURE__OFFSET);
28 nStack_ = buffer.GetU32(RegExpParser::NUM_STACK_OFFSET);
29 flags_ = buffer.GetU32(RegExpParser::FLAGS_OFFSET);
30 prefilter_ = buffer.GetU32(RegExpParser::PREFILTER_OFFSET);
31 isWideChar_ = isWideChar;
32
33 uint32_t captureResultSize = sizeof(CaptureState) * nCapture_;
34 uint32_t stackSize = sizeof(uintptr_t) * nStack_;
35 stateStackLen_ = 0;
36 currentStack_ = 0;
37
38 if (captureResultSize != 0) {
39 if (captureResultList_ == nullptr) {
40 captureResultList_ = chunk_->NewArray<CaptureState>(nCapture_);
41 }
42 if (memset_s(captureResultList_, captureResultSize, 0, captureResultSize) != EOK) {
43 LOG_FULL(FATAL) << "memset_s failed";
44 UNREACHABLE();
45 }
46 }
47 if (stackSize != 0 && stack_ == nullptr) {
48 stack_ = chunk_->NewArray<uintptr_t>(nStack_);
49 if (memset_s(stack_, stackSize, 0, stackSize) != EOK) {
50 LOG_FULL(FATAL) << "memset_s failed";
51 UNREACHABLE();
52 }
53 }
54 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
55 SetCurrentPtr(input + lastIndex * (isWideChar ? WIDE_CHAR_SIZE : CHAR_SIZE));
56 SetCurrentPC(RegExpParser::OP_START_OFFSET);
57
58 // first split
59 if ((flags_ & RegExpParser::FLAG_STICKY) == 0) {
60 PushRegExpState(STATE_SPLIT, RegExpParser::OP_START_OFFSET);
61 }
62 return ExecuteInternal(buffer, size);
63 }
64
MatchFailed(bool isMatched)65 bool RegExpExecutor::MatchFailed(bool isMatched)
66 {
67 if (isMatched) {
68 stateStackLen_ = 0;
69 return true;
70 }
71 while (stateStackLen_ > 0) {
72 // StateType::STATE_SPLIT or STATE_NEGATIVE_MATCH_AHEAD
73 if (PopRegExpState() <= StateType::STATE_NEGATIVE_MATCH_AHEAD) {
74 return false;
75 }
76 }
77 return true;
78 }
79
80 // NOLINTNEXTLINE(readability-function-size)
ExecuteInternal(const DynChunk & byteCode,uint32_t pcEnd)81 bool RegExpExecutor::ExecuteInternal(const DynChunk &byteCode, uint32_t pcEnd)
82 {
83 while (GetCurrentPC() < pcEnd) {
84 // first split
85 if (!HandleFirstSplit()) {
86 return false;
87 }
88 uint8_t opCode = byteCode.GetU8(GetCurrentPC());
89 switch (opCode) {
90 case RegExpOpCode::OP_DOTS:
91 case RegExpOpCode::OP_ALL: {
92 if (!HandleOpAll(opCode)) {
93 return false;
94 }
95 break;
96 }
97 case RegExpOpCode::OP_CHAR32:
98 case RegExpOpCode::OP_CHAR: {
99 if (!HandleOpChar(byteCode, opCode)) {
100 return false;
101 }
102 break;
103 }
104 case RegExpOpCode::OP_NOT_WORD_BOUNDARY:
105 case RegExpOpCode::OP_WORD_BOUNDARY: {
106 if (!HandleOpWordBoundary(opCode)) {
107 return false;
108 }
109 break;
110 }
111 case RegExpOpCode::OP_LINE_START: {
112 if (!HandleOpLineStart(opCode)) {
113 return false;
114 }
115 break;
116 }
117 case RegExpOpCode::OP_LINE_END: {
118 if (!HandleOpLineEnd(opCode)) {
119 return false;
120 }
121 break;
122 }
123 case RegExpOpCode::OP_SAVE_START:
124 HandleOpSaveStart(byteCode, opCode);
125 break;
126 case RegExpOpCode::OP_SAVE_END:
127 HandleOpSaveEnd(byteCode, opCode);
128 break;
129 case RegExpOpCode::OP_GOTO: {
130 uint32_t offset = byteCode.GetU32(GetCurrentPC() + 1);
131 Advance(opCode, offset);
132 break;
133 }
134 case RegExpOpCode::OP_MATCH: {
135 ASSERT(stateStackLen_ > 0);
136 // jump to match ahead
137 uint32_t ahead = stateStackLen_ - 1;
138 auto stateStack = reinterpret_cast<RegExpState *>(stateStack_);
139 while (ahead != 0 && stateStack[ahead].type_ != StateType::STATE_MATCH_AHEAD &&
140 stateStack[ahead].type_ != StateType::STATE_NEGATIVE_MATCH_AHEAD) {
141 --ahead;
142 }
143 bool isNegative = stateStack[ahead].type_ == StateType::STATE_NEGATIVE_MATCH_AHEAD;
144 while (stateStackLen_ > ahead) {
145 PopRegExpState(isNegative);
146 }
147 if (isNegative && MatchFailed(false)) {
148 return false;
149 }
150 break;
151 }
152 case RegExpOpCode::OP_MATCH_END:
153 return true;
154 case RegExpOpCode::OP_SAVE_RESET:
155 HandleOpSaveReset(byteCode, opCode);
156 break;
157 case RegExpOpCode::OP_SPLIT_NEXT:
158 case RegExpOpCode::OP_MATCH_AHEAD:
159 case RegExpOpCode::OP_NEGATIVE_MATCH_AHEAD:
160 HandleOpMatch(byteCode, opCode);
161 break;
162 case RegExpOpCode::OP_SPLIT_FIRST:
163 HandleOpSplitFirst(byteCode, opCode);
164 break;
165 case RegExpOpCode::OP_PREV: {
166 if (!HandleOpPrev(opCode)) {
167 return false;
168 }
169 break;
170 }
171 case RegExpOpCode::OP_LOOP_GREEDY:
172 case RegExpOpCode::OP_LOOP:
173 HandleOpLoop(byteCode, opCode);
174 break;
175 case RegExpOpCode::OP_PUSH_CHAR: {
176 PushRegExpState(StateType::STATE_PUSH, 0, 0);
177 PushStack(reinterpret_cast<uintptr_t>(GetCurrentPtr()));
178 Advance(opCode);
179 break;
180 }
181 case RegExpOpCode::OP_CHECK_CHAR: {
182 if (stateStackLen_ > 0 && PeekRegExpState()->type_ == StateType::STATE_PUSH) {
183 DropRegExpState();
184 } else {
185 ASSERT(currentStack_ > 0);
186 PushRegExpState(StateType::STATE_POP, 0, stack_[currentStack_ - 1]);
187 }
188 if (PopStack() != reinterpret_cast<uintptr_t>(GetCurrentPtr())) {
189 Advance(opCode);
190 } else {
191 uint32_t offset = byteCode.GetU32(GetCurrentPC() + 1);
192 Advance(opCode, offset);
193 }
194 break;
195 }
196 case RegExpOpCode::OP_PUSH: {
197 PushRegExpState(StateType::STATE_PUSH, 0, 0);
198 PushStack(0);
199 Advance(opCode);
200 break;
201 }
202 case RegExpOpCode::OP_POP: {
203 ASSERT(currentStack_ > 0);
204 PushRegExpState(StateType::STATE_POP, 0, stack_[currentStack_ - 1]);
205 PopStack();
206 Advance(opCode);
207 break;
208 }
209 case RegExpOpCode::OP_RANGE32: {
210 if (!HandleOpRange32(byteCode)) {
211 return false;
212 }
213 break;
214 }
215 case RegExpOpCode::OP_RANGE: {
216 if (!HandleOpRange(byteCode)) {
217 return false;
218 }
219 break;
220 }
221 case RegExpOpCode::OP_SPARSE: {
222 if (!HandleOpSparse(byteCode)) {
223 return false;
224 }
225 break;
226 }
227 case RegExpOpCode::OP_BACKREFERENCE:
228 case RegExpOpCode::OP_BACKWARD_BACKREFERENCE: {
229 if (!HandleOpBackReference(byteCode, opCode)) {
230 return false;
231 }
232 break;
233 }
234 default: {
235 LOG_ECMA(FATAL) << "opCode overflow, OpCode: " << static_cast<int32_t>(opCode)
236 << " CurrentPc: " << static_cast<int32_t>(GetCurrentPC()) << " pcEnd: " << pcEnd;
237 UNREACHABLE();
238 }
239 }
240 }
241 // for loop match
242 return true;
243 }
244
DumpResult(std::ostream & out) const245 void RegExpExecutor::DumpResult(std::ostream &out) const
246 {
247 out << "captures:" << std::endl;
248 for (uint32_t i = 0; i < nCapture_; i++) {
249 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
250 CaptureState *captureState = &captureResultList_[i];
251 int32_t len = captureState->captureEnd - captureState->captureStart;
252 if ((captureState->captureStart != nullptr && captureState->captureEnd != nullptr) && (len >= 0)) {
253 out << i << ":\t" << CString(reinterpret_cast<const char *>(captureState->captureStart), len) << std::endl;
254 } else {
255 out << i << ":\t"
256 << "undefined" << std::endl;
257 }
258 }
259 }
260
GetResult(JSThread * thread)261 void RegExpExecutor::GetResult(JSThread *thread)
262 {
263 JSHandle<RegExpGlobalResult> matchResult(thread->GetCurrentEcmaContext()->GetRegExpGlobalResult());
264 matchResult->SetTotalCaptureCounts(thread, JSTaggedValue(nCapture_));
265 uint32_t firstIndex = RegExpGlobalResult::FIRST_CAPTURE_INDEX;
266 uint32_t availableCaptureSlot = matchResult->GetLength() - firstIndex;
267 uint32_t requiredLength = nCapture_ * 2;
268 if (requiredLength > availableCaptureSlot) {
269 matchResult = RegExpGlobalResult::GrowCapturesCapacity(thread, matchResult, requiredLength + firstIndex);
270 }
271 for (uint32_t i = 0; i < nCapture_; i++) {
272 CaptureState *captureState = &captureResultList_[i];
273 int32_t len = captureState->captureEnd - captureState->captureStart;
274 if ((captureState->captureStart != nullptr && captureState->captureEnd != nullptr) && (len >= 0)) {
275 if (isWideChar_) {
276 matchResult->SetStartOfCaptureIndex(thread, i, JSTaggedValue(
277 static_cast<int32_t>((captureState->captureStart - input_) / WIDE_CHAR_SIZE)));
278 matchResult->SetEndOfCaptureIndex(thread, i, JSTaggedValue(
279 static_cast<int32_t>((captureState->captureEnd - input_) / WIDE_CHAR_SIZE)));
280 } else {
281 matchResult->SetStartOfCaptureIndex(thread, i, JSTaggedValue(
282 static_cast<int32_t>(captureState->captureStart - input_)));
283 matchResult->SetEndOfCaptureIndex(thread, i, JSTaggedValue(
284 static_cast<int32_t>(captureState->captureEnd - input_)));
285 }
286 } else {
287 // undefined
288 matchResult->SetStartOfCaptureIndex(thread, i, JSTaggedValue(0));
289 matchResult->SetEndOfCaptureIndex(thread, i, JSTaggedValue(-1));
290 }
291 }
292 uint32_t endIndex = currentPtr_ - input_;
293 if (isWideChar_) {
294 endIndex /= WIDE_CHAR_SIZE;
295 }
296 matchResult->SetEndIndex(thread, JSTaggedValue(endIndex));
297 }
298
PushRegExpState(StateType type,uint32_t pc)299 void RegExpExecutor::PushRegExpState(StateType type, uint32_t pc)
300 {
301 ReAllocStack(stateStackLen_ + 1);
302 auto state = reinterpret_cast<RegExpState *>(
303 stateStack_ + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
304 stateStackLen_ * sizeof(RegExpState));
305 state->type_ = type;
306 state->currentPc_ = pc;
307 state->currentPtr_ = GetCurrentPtr();
308 stateStackLen_++;
309 }
310
PushRegExpState(StateType type,uint32_t pc,uintptr_t ptr)311 void RegExpExecutor::PushRegExpState(StateType type, uint32_t pc, uintptr_t ptr)
312 {
313 ReAllocStack(stateStackLen_ + 1);
314 auto state = reinterpret_cast<RegExpState *>(
315 stateStack_ + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
316 stateStackLen_ * sizeof(RegExpState));
317 state->type_ = type;
318 state->currentPc_ = pc;
319 state->currentPtr_ = reinterpret_cast<const uint8_t *>(ptr);
320 stateStackLen_++;
321 }
322
PopRegExpState(bool copyCapture)323 RegExpExecutor::StateType RegExpExecutor::PopRegExpState(bool copyCapture)
324 {
325 if (stateStackLen_ != 0) {
326 auto state = PeekRegExpState();
327 stateStackLen_--;
328 switch (state->type_) {
329 case StateType::STATE_SPLIT:
330 case StateType::STATE_NEGATIVE_MATCH_AHEAD:
331 case StateType::STATE_MATCH_AHEAD:
332 SetCurrentPC(state->currentPc_);
333 SetCurrentPtr(state->currentPtr_);
334 break;
335 case StateType::STATE_SAVE:
336 if (copyCapture) {
337 *(reinterpret_cast<const uint8_t **>(GetCaptureResultList()) + state->currentPc_) =
338 state->currentPtr_;
339 }
340 break;
341 case StateType::STATE_PUSH:
342 PopStack();
343 break;
344 case StateType::STATE_POP:
345 PushStack((uintptr_t)state->currentPtr_);
346 break;
347 case StateType::STATE_SET:
348 SetStackValue((uintptr_t)state->currentPtr_);
349 break;
350 default:
351 UNREACHABLE();
352 break;
353 }
354 return state->type_;
355 }
356 return StateType::STATE_INVALID;
357 }
358
ReAllocStack(uint32_t stackLen)359 void RegExpExecutor::ReAllocStack(uint32_t stackLen)
360 {
361 if (stackLen > stateStackSize_) {
362 ASSERT((static_cast<size_t>(stateStackSize_) * 2) <= static_cast<size_t>(UINT32_MAX)); // 2: double the size
363 uint32_t newStackSize = std::max(stateStackSize_ * 2, MIN_STACK_SIZE); // 2: double the size
364 ASSERT((static_cast<size_t>(newStackSize) * static_cast<size_t>(sizeof(RegExpState))) <=
365 static_cast<size_t>(UINT32_MAX));
366 uint32_t stackByteSize = newStackSize * sizeof(RegExpState);
367 auto newStack = chunk_->NewArray<uint8_t>(stackByteSize);
368 if (memset_s(newStack, stackByteSize, 0, stackByteSize) != EOK) {
369 LOG_FULL(FATAL) << "memset_s failed";
370 UNREACHABLE();
371 }
372 if (stateStack_ != nullptr) {
373 auto stackSize = stateStackSize_ * sizeof(RegExpState);
374 if (memcpy_s(newStack, stackSize, stateStack_, stackSize) != EOK) {
375 return;
376 }
377 }
378 stateStack_ = newStack;
379 stateStackSize_ = newStackSize;
380 }
381 }
382 } // namespace panda::ecmascript
383