1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "ecmascript/regexp/regexp_executor.h"
17
18 #include "ecmascript/base/string_helper.h"
19 #include "ecmascript/mem/c_string.h"
20 #include "ecmascript/mem/dyn_chunk.h"
21 #include "ecmascript/regexp/regexp_opcode.h"
22 #include "securec.h"
23
24 namespace panda::ecmascript {
25 using RegExpState = RegExpExecutor::RegExpState;
26 using MatchResult = RegExpExecutor::MatchResult;
Execute(const uint8_t * input,uint32_t lastIndex,uint32_t length,uint8_t * buf,bool isWideChar)27 bool RegExpExecutor::Execute(const uint8_t *input, uint32_t lastIndex, uint32_t length, uint8_t *buf, bool isWideChar)
28 {
29 DynChunk buffer(buf, chunk_);
30 input_ = const_cast<uint8_t *>(input);
31 inputEnd_ = const_cast<uint8_t *>(input + length * (isWideChar ? WIDE_CHAR_SIZE : CHAR_SIZE));
32 uint32_t size = buffer.GetU32(0);
33 nCapture_ = buffer.GetU32(RegExpParser::NUM_CAPTURE__OFFSET);
34 nStack_ = buffer.GetU32(RegExpParser::NUM_STACK_OFFSET);
35 flags_ = buffer.GetU32(RegExpParser::FLAGS_OFFSET);
36 isWideChar_ = isWideChar;
37
38 uint32_t captureResultSize = sizeof(CaptureState) * nCapture_;
39 uint32_t stackSize = sizeof(uintptr_t) * nStack_;
40 stateSize_ = sizeof(RegExpState) + captureResultSize + stackSize;
41 stateStackLen_ = 0;
42
43 if (captureResultSize != 0) {
44 captureResultList_ = chunk_->NewArray<CaptureState>(nCapture_);
45 if (memset_s(captureResultList_, captureResultSize, 0, captureResultSize) != EOK) {
46 LOG_FULL(FATAL) << "memset_s failed";
47 UNREACHABLE();
48 }
49 }
50 if (stackSize != 0) {
51 stack_ = chunk_->NewArray<uintptr_t>(nStack_);
52 if (memset_s(stack_, stackSize, 0, stackSize) != EOK) {
53 LOG_FULL(FATAL) << "memset_s failed";
54 UNREACHABLE();
55 }
56 }
57 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
58 SetCurrentPtr(input + lastIndex * (isWideChar ? WIDE_CHAR_SIZE : CHAR_SIZE));
59 SetCurrentPC(RegExpParser::OP_START_OFFSET);
60
61 // first split
62 if ((flags_ & RegExpParser::FLAG_STICKY) == 0) {
63 PushRegExpState(STATE_SPLIT, RegExpParser::OP_START_OFFSET);
64 }
65 return ExecuteInternal(buffer, size);
66 }
67
MatchFailed(bool isMatched)68 bool RegExpExecutor::MatchFailed(bool isMatched)
69 {
70 while (true) {
71 if (stateStackLen_ == 0) {
72 return true;
73 }
74 RegExpState *state = PeekRegExpState();
75 if (state->type_ == StateType::STATE_SPLIT) {
76 if (!isMatched) {
77 PopRegExpState();
78 return false;
79 }
80 } else {
81 isMatched = (state->type_ == StateType::STATE_MATCH_AHEAD && isMatched) ||
82 (state->type_ == StateType::STATE_NEGATIVE_MATCH_AHEAD && !isMatched);
83 if (isMatched) {
84 if (state->type_ == StateType::STATE_MATCH_AHEAD) {
85 PopRegExpState(false);
86 return false;
87 }
88 if (state->type_ == StateType::STATE_NEGATIVE_MATCH_AHEAD) {
89 PopRegExpState();
90 return false;
91 }
92 }
93 }
94 DropRegExpState();
95 }
96
97 return true;
98 }
99
100 // NOLINTNEXTLINE(readability-function-size)
ExecuteInternal(const DynChunk & byteCode,uint32_t pcEnd)101 bool RegExpExecutor::ExecuteInternal(const DynChunk &byteCode, uint32_t pcEnd)
102 {
103 while (GetCurrentPC() < pcEnd) {
104 // first split
105 if (!HandleFirstSplit()) {
106 return false;
107 }
108 uint8_t opCode = byteCode.GetU8(GetCurrentPC());
109 switch (opCode) {
110 case RegExpOpCode::OP_DOTS:
111 case RegExpOpCode::OP_ALL: {
112 if (!HandleOpAll(opCode)) {
113 return false;
114 }
115 break;
116 }
117 case RegExpOpCode::OP_CHAR32:
118 case RegExpOpCode::OP_CHAR: {
119 if (!HandleOpChar(byteCode, opCode)) {
120 return false;
121 }
122 break;
123 }
124 case RegExpOpCode::OP_NOT_WORD_BOUNDARY:
125 case RegExpOpCode::OP_WORD_BOUNDARY: {
126 if (!HandleOpWordBoundary(opCode)) {
127 return false;
128 }
129 break;
130 }
131 case RegExpOpCode::OP_LINE_START: {
132 if (!HandleOpLineStart(opCode)) {
133 return false;
134 }
135 break;
136 }
137 case RegExpOpCode::OP_LINE_END: {
138 if (!HandleOpLineEnd(opCode)) {
139 return false;
140 }
141 break;
142 }
143 case RegExpOpCode::OP_SAVE_START:
144 HandleOpSaveStart(byteCode, opCode);
145 break;
146 case RegExpOpCode::OP_SAVE_END:
147 HandleOpSaveEnd(byteCode, opCode);
148 break;
149 case RegExpOpCode::OP_GOTO: {
150 uint32_t offset = byteCode.GetU32(GetCurrentPC() + 1);
151 Advance(opCode, offset);
152 break;
153 }
154 case RegExpOpCode::OP_MATCH: {
155 // jump to match ahead
156 if (MatchFailed(true)) {
157 return false;
158 }
159 break;
160 }
161 case RegExpOpCode::OP_MATCH_END:
162 return true;
163 case RegExpOpCode::OP_SAVE_RESET:
164 HandleOpSaveReset(byteCode, opCode);
165 break;
166 case RegExpOpCode::OP_SPLIT_NEXT:
167 case RegExpOpCode::OP_MATCH_AHEAD:
168 case RegExpOpCode::OP_NEGATIVE_MATCH_AHEAD:
169 HandleOpMatch(byteCode, opCode);
170 break;
171 case RegExpOpCode::OP_SPLIT_FIRST:
172 HandleOpSplitFirst(byteCode, opCode);
173 break;
174 case RegExpOpCode::OP_PREV: {
175 if (!HandleOpPrev(opCode)) {
176 return false;
177 }
178 break;
179 }
180 case RegExpOpCode::OP_LOOP_GREEDY:
181 case RegExpOpCode::OP_LOOP:
182 HandleOpLoop(byteCode, opCode);
183 break;
184 case RegExpOpCode::OP_PUSH_CHAR: {
185 PushStack(reinterpret_cast<uintptr_t>(GetCurrentPtr()));
186 Advance(opCode);
187 break;
188 }
189 case RegExpOpCode::OP_CHECK_CHAR: {
190 if (PopStack() != reinterpret_cast<uintptr_t>(GetCurrentPtr())) {
191 Advance(opCode);
192 } else {
193 uint32_t offset = byteCode.GetU32(GetCurrentPC() + 1);
194 Advance(opCode, offset);
195 }
196 break;
197 }
198 case RegExpOpCode::OP_PUSH: {
199 PushStack(0);
200 Advance(opCode);
201 break;
202 }
203 case RegExpOpCode::OP_POP: {
204 PopStack();
205 Advance(opCode);
206 break;
207 }
208 case RegExpOpCode::OP_RANGE32: {
209 if (!HandleOpRange32(byteCode)) {
210 return false;
211 }
212 break;
213 }
214 case RegExpOpCode::OP_RANGE: {
215 if (!HandleOpRange(byteCode)) {
216 return false;
217 }
218 break;
219 }
220 case RegExpOpCode::OP_BACKREFERENCE:
221 case RegExpOpCode::OP_BACKWARD_BACKREFERENCE: {
222 if (!HandleOpBackReference(byteCode, opCode)) {
223 return false;
224 }
225 break;
226 }
227 default:
228 UNREACHABLE();
229 }
230 }
231 // for loop match
232 return true;
233 }
234
DumpResult(std::ostream & out) const235 void RegExpExecutor::DumpResult(std::ostream &out) const
236 {
237 out << "captures:" << std::endl;
238 for (uint32_t i = 0; i < nCapture_; i++) {
239 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
240 CaptureState *captureState = &captureResultList_[i];
241 int32_t len = captureState->captureEnd - captureState->captureStart;
242 if ((captureState->captureStart != nullptr && captureState->captureEnd != nullptr) && (len >= 0)) {
243 out << i << ":\t" << CString(reinterpret_cast<const char *>(captureState->captureStart), len) << std::endl;
244 } else {
245 out << i << ":\t"
246 << "undefined" << std::endl;
247 }
248 }
249 }
250
GetResult(const JSThread * thread,bool isSuccess) const251 MatchResult RegExpExecutor::GetResult(const JSThread *thread, bool isSuccess) const
252 {
253 ObjectFactory *factory = thread->GetEcmaVM()->GetFactory();
254 MatchResult result;
255 std::vector<std::pair<bool, JSHandle<EcmaString>>> captures;
256 result.isSuccess_ = isSuccess;
257 if (isSuccess) {
258 for (uint32_t i = 0; i < nCapture_; i++) {
259 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
260 CaptureState *captureState = &captureResultList_[i];
261 if (i == 0) {
262 result.index_ = captureState->captureStart - input_;
263 if (isWideChar_) {
264 result.index_ /= WIDE_CHAR_SIZE;
265 }
266 }
267 int32_t len = captureState->captureEnd - captureState->captureStart;
268 std::pair<bool, JSHandle<EcmaString>> pair;
269 if ((captureState->captureStart != nullptr && captureState->captureEnd != nullptr) && (len >= 0)) {
270 pair.first = false;
271 if (isWideChar_) {
272 // create utf-16 string
273 pair.second = factory->NewFromUtf16(
274 reinterpret_cast<const uint16_t *>(captureState->captureStart), len / 2);
275 } else {
276 // create utf-8 string
277 CVector<uint8_t> buffer(len + 1);
278 uint8_t *dest = buffer.data();
279 if (memcpy_s(dest, len + 1, reinterpret_cast<const uint8_t *>(captureState->captureStart), len) !=
280 EOK) {
281 LOG_FULL(FATAL) << "memcpy_s failed";
282 UNREACHABLE();
283 }
284 dest[len] = '\0'; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
285 pair.second =
286 factory->NewFromUtf8(reinterpret_cast<const uint8_t *>(buffer.data()), len);
287 }
288 } else {
289 // undefined
290 pair.first = true;
291 }
292 captures.emplace_back(pair);
293 }
294 result.captures_ = captures;
295 result.endIndex_ = currentPtr_ - input_;
296 if (isWideChar_) {
297 result.endIndex_ /= WIDE_CHAR_SIZE;
298 }
299 }
300 return result;
301 }
302
PushRegExpState(StateType type,uint32_t pc)303 void RegExpExecutor::PushRegExpState(StateType type, uint32_t pc)
304 {
305 ReAllocStack(stateStackLen_ + 1);
306 auto state = reinterpret_cast<RegExpState *>(
307 stateStack_ + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
308 stateStackLen_ * stateSize_);
309 state->type_ = type;
310 state->currentPc_ = pc;
311 state->currentStack_ = currentStack_;
312 state->currentPtr_ = GetCurrentPtr();
313 size_t listSize = sizeof(CaptureState) * nCapture_;
314 if (memcpy_s(state->captureResultList_, listSize, GetCaptureResultList(), listSize) != EOK) {
315 LOG_FULL(FATAL) << "memcpy_s failed";
316 UNREACHABLE();
317 }
318 uint8_t *stackStart =
319 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
320 reinterpret_cast<uint8_t *>(state->captureResultList_) + sizeof(CaptureState) * nCapture_;
321 if (stack_ != nullptr) {
322 size_t stackSize = sizeof(uintptr_t) * nStack_;
323 if (memcpy_s(stackStart, stackSize, stack_, stackSize) != EOK) {
324 LOG_FULL(FATAL) << "memcpy_s failed";
325 UNREACHABLE();
326 }
327 }
328 stateStackLen_++;
329 }
330
PopRegExpState(bool copyCaptrue)331 RegExpState *RegExpExecutor::PopRegExpState(bool copyCaptrue)
332 {
333 if (stateStackLen_ != 0) {
334 auto state = PeekRegExpState();
335 size_t listSize = sizeof(CaptureState) * nCapture_;
336 if (copyCaptrue) {
337 if (memcpy_s(GetCaptureResultList(), listSize, state->captureResultList_, listSize) != EOK) {
338 LOG_FULL(FATAL) << "memcpy_s failed";
339 UNREACHABLE();
340 }
341 }
342 SetCurrentPtr(state->currentPtr_);
343 SetCurrentPC(state->currentPc_);
344 currentStack_ = state->currentStack_;
345 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
346 uint8_t *stackStart = reinterpret_cast<uint8_t *>(state->captureResultList_) + listSize;
347 if (stack_ != nullptr) {
348 size_t stackSize = sizeof(uintptr_t) * nStack_;
349 if (memcpy_s(stack_, stackSize, stackStart, stackSize) != EOK) {
350 LOG_FULL(FATAL) << "memcpy_s failed";
351 UNREACHABLE();
352 }
353 }
354 stateStackLen_--;
355 return state;
356 }
357 return nullptr;
358 }
359
ReAllocStack(uint32_t stackLen)360 void RegExpExecutor::ReAllocStack(uint32_t stackLen)
361 {
362 if (stackLen > stateStackSize_) {
363 ASSERT((static_cast<size_t>(stateStackSize_) * 2) <= static_cast<size_t>(UINT32_MAX)); // 2: double the size
364 uint32_t newStackSize = std::max(stateStackSize_ * 2, MIN_STACK_SIZE); // 2: double the size
365 ASSERT((static_cast<size_t>(newStackSize) * static_cast<size_t>(stateSize_)) <=
366 static_cast<size_t>(UINT32_MAX));
367 uint32_t stackByteSize = newStackSize * stateSize_;
368 auto newStack = chunk_->NewArray<uint8_t>(stackByteSize);
369 if (memset_s(newStack, stackByteSize, 0, stackByteSize) != EOK) {
370 LOG_FULL(FATAL) << "memset_s failed";
371 UNREACHABLE();
372 }
373 if (stateStack_ != nullptr) {
374 auto stackSize = stateStackSize_ * stateSize_;
375 if (memcpy_s(newStack, stackSize, stateStack_, stackSize) != EOK) {
376 return;
377 }
378 }
379 stateStack_ = newStack;
380 stateStackSize_ = newStackSize;
381 }
382 }
383 } // namespace panda::ecmascript
384