/*
 * Copyright (c) 2021 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 
16 #ifndef ECMASCRIPT_REGEXP_PARSER_H
17 #define ECMASCRIPT_REGEXP_PARSER_H
18 
19 #include <cstdarg>
20 #include <cstdio>
21 #include <cstdint>
22 #include "ecmascript/js_thread.h"
23 #include "ecmascript/ecma_macros.h"
24 #include "ecmascript/mem/chunk.h"
25 #include "ecmascript/mem/c_containers.h"
26 #include "ecmascript/mem/c_string.h"
27 #include "ecmascript/mem/dyn_chunk.h"
28 #include "ecmascript/regexp/regexp_opcode.h"
29 #include "unicode/stringpiece.h"
30 #include "unicode/uchar.h"
31 #include "unicode/utf16.h"
32 #include "unicode/utf8.h"
33 #include "unicode/utypes.h"
34 #include "unicode/udata.h"
35 
36 namespace panda::ecmascript {
37 class RegExpParser {
38 public:
39     static constexpr auto FLAG_GLOBAL = (1U << 0U);
40     static constexpr auto FLAG_IGNORECASE = (1U << 1U);
41     static constexpr auto FLAG_MULTILINE = (1U << 2U);
42     static constexpr auto FLAG_DOTALL = (1U << 3U);
43     static constexpr auto FLAG_UTF16 = (1U << 4U);
44     static constexpr auto FLAG_STICKY = (1U << 5U);
45     static constexpr auto FLAG_HASINDICES = (1U << 6U);
46     static constexpr uint32_t FLAG_NUM = 7;
47     static const uint32_t KEY_EOF = UINT32_MAX;
48     static constexpr int CLASS_RANGE_BASE = 0x40000000;
49     static constexpr uint32_t NUM_CAPTURE__OFFSET = 4;
50     static constexpr uint32_t NUM_STACK_OFFSET = 8;
51     static constexpr uint32_t OCTAL_VALUE = 8;
52     static constexpr uint32_t OCTAL_VALUE_RANGE = 32;
53     static constexpr uint32_t HEX_VALUE = 16;
54     static constexpr uint32_t DECIMAL_DIGITS_ADVANCE = 10;
55     static constexpr uint32_t FLAGS_OFFSET = 12;
56     static constexpr uint32_t OP_START_OFFSET = 16;
57     static constexpr uint32_t UNICODE_HEX_VALUE = 4;
58     static constexpr uint32_t UNICODE_HEX_ADVANCE = 2;
59     static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3;
60     static constexpr uint32_t UTF8_CHAR_LEN_MAX = 6;
61     static int Canonicalize(int c, bool isUnicode);
62 
RegExpParser(JSThread * thread,Chunk * chunk)63     explicit RegExpParser(JSThread *thread, Chunk *chunk)
64         : thread_(thread),
65           base_(nullptr),
66           pc_(nullptr),
67           end_(nullptr),
68           flags_(0),
69           c0_(KEY_EOF),
70           captureCount_(0),
71           stackCount_(0),
72           isError_(false),
73           buffer_(chunk),
74           groupNames_(chunk)
75     {
76     }
77 
~RegExpParser()78     ~RegExpParser()
79     {
80         Clear();
81     }
82 
83     NO_COPY_SEMANTIC(RegExpParser);
84     NO_MOVE_SEMANTIC(RegExpParser);
85 
Init(char * source,size_t length,uint32_t flags)86     inline void Init(char *source, size_t length, uint32_t flags)
87     {
88         pc_ = reinterpret_cast<uint8_t *>(source);
89         base_ = pc_;
90         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
91         end_ = reinterpret_cast<uint8_t *>(source) + length - 1;
92         flags_ = flags;
93     }
94 
95     void Parse();
96     void ParseDisjunction(bool isBackward);
97     void ParseAlternative(bool isBackward);
98     bool ParseAssertionCapture(int *captureIndex, bool isBackward);
99     void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd);
100     int ParseDecimalDigits();
101     int ParseAtomEscape(bool isBackward);
102     int ParseCharacterEscape();
103     bool ParseGroupSpecifier(const uint8_t **pp, CString &name);
104     int ParseCaptureCount(const char *groupName);
105     bool ParseClassRanges(RangeSet *result);
106     void ParseNonemptyClassRangesNoDash(DynChunk *buffer);
107     uint32_t ParseClassAtom(RangeSet *atom);
108     int ParseClassEscape(RangeSet *atom);
109     void ParseError(const char *errorMessage);
110     void ParseUnicodePropertyValueCharacters(bool *isValue);
111     int FindGroupName(const CString &name);
112     uint32_t ParseOctalLiteral();
113     bool ParseHexEscape(int length, uint32_t *value);
114     bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value);
115     bool ParseUnicodeEscape(uint32_t *value);
116     bool ParserIntervalQuantifier(int *pmin, int *pmax);
117     bool HasNamedCaptures();
118     int ParseEscape(const uint8_t **pp, int isUtf16);
119     int RecountCaptures();
120     int IsIdentFirst(uint32_t c);
121     bool NeedIntersection(uint32_t c);
122     void DoParserStackOverflowCheck(const char *errorMessage);
123 
GetGroupNames()124     inline CVector<CString> GetGroupNames() const
125     {
126         return newGroupNames_;
127     }
128 
GetGroupNamesSize()129     inline size_t GetGroupNamesSize() const
130     {
131         return groupNames_.size_;
132     }
133 
IsError()134     inline bool IsError() const
135     {
136         return isError_;
137     }
138 
GetOriginBuffer()139     inline uint8_t *GetOriginBuffer() const
140     {
141         return buffer_.buf_;
142     }
143 
GetOriginBufferSize()144     inline size_t GetOriginBufferSize() const
145     {
146         return buffer_.size_;
147     }
148 
GetErrorMsg()149     inline CString GetErrorMsg() const
150     {
151         if (isError_) {
152             return CString(errorMsg_);
153         }
154         return CString("");
155     }
156 
IsGlobal()157     inline bool IsGlobal() const
158     {
159         return (flags_ & FLAG_GLOBAL) != 0;
160     }
161 
IsIgnoreCase()162     inline bool IsIgnoreCase() const
163     {
164         return (flags_ & FLAG_IGNORECASE) != 0;
165     }
166 
IsMultiline()167     inline bool IsMultiline() const
168     {
169         return (flags_ & FLAG_MULTILINE) != 0;
170     }
171 
IsDotAll()172     inline bool IsDotAll() const
173     {
174         return (flags_ & FLAG_DOTALL) != 0;
175     }
176 
IsUtf16()177     inline bool IsUtf16() const
178     {
179         return (flags_ & FLAG_UTF16) != 0;
180     }
181 
IsStick()182     inline bool IsStick() const
183     {
184         return (flags_ & FLAG_STICKY) != 0;
185     }
186 
GetcurrentCharNext(int c)187     inline static int GetcurrentCharNext(int c)
188     {
189         int cur = c;
190         c = u_tolower(static_cast<UChar32>(c));
191         if (c == cur) {
192             c = u_toupper(static_cast<UChar32>(c));
193         }
194         if (((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) &&
195             !((cur >= 'A' && cur <= 'Z') || (cur >= 'a' && cur <= 'z'))) {
196             c = cur;
197         }
198         return c;
199     }
ProcessIntersection(RangeSet * result)200     inline static void ProcessIntersection(RangeSet *result)
201     {
202         RangeSet cr;
203         RangeSet cr1;
204         const uint32_t MINLOWERCHAR = 'a';
205         const uint32_t MAXLOWERCHAR = 'z' + 1;
206         const uint32_t MINUPPERCHAR = 'A';
207         const uint32_t MAXUPPERCHAR = 'Z' + 1;
208         // Range values for a and z + 1
209         cr.Insert(MINLOWERCHAR, MAXLOWERCHAR);
210         // Range values for A and Z + 1
211         cr.Insert(MINUPPERCHAR, MAXUPPERCHAR);
212         result->Inter(cr1, cr);
213         result->Insert(cr1);
214     }
215 private:
216     friend class RegExpExecutor;
217     static constexpr int TMP_BUF_SIZE = 128;
Clear()218     void Clear()
219     {
220         base_ = nullptr;
221         pc_ = nullptr;
222         end_ = nullptr;
223         c0_ = KEY_EOF;
224         isError_ = false;
225     }
226 
Advance()227     void Advance()
228     {
229         if (pc_ <= end_) {
230             DoParserStackOverflowCheck("Advance stack overflow!");
231             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
232             c0_ = *pc_++;
233         } else {
234             c0_ = KEY_EOF;
235         }
236     }
237 
Advance(int offset)238     void Advance(int offset)
239     {
240         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
241         pc_ += offset - 1;
242         Advance();
243     }
244 
Prev()245     void Prev()
246     {
247         if (pc_ >= base_) {
248             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
249             c0_ = *pc_--;
250         } else {
251             c0_ = KEY_EOF;
252         }
253     }
254 
SetIsError()255     void SetIsError()
256     {
257         isError_ = true;
258     }
259 
260     void PrintF(const char *fmt, ...);
261     JSThread *thread_;
262     uint8_t *base_;
263     uint8_t *pc_;
264     uint8_t *end_;
265     uint32_t flags_;
266     uint32_t c0_;
267     int captureCount_;
268     int stackCount_;
269     bool isError_;
270     char errorMsg_[TMP_BUF_SIZE] = {0};  // NOLINTNEXTLINE(modernize-avoid-c-arrays)
271     int hasNamedCaptures_ = -1;
272     int totalCaptureCount_ = -1;
273     DynChunk buffer_;
274     DynChunk groupNames_;
275     CVector<CString> newGroupNames_;
276 };
277 }  // namespace panda::ecmascript
278 #endif  // ECMASCRIPT_REGEXP_PARSER_H
279