• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (c) 2021 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 
16 #ifndef ECMASCRIPT_REGEXP_PARSER_H
17 #define ECMASCRIPT_REGEXP_PARSER_H
18 
19 #include <cstdarg>
20 #include <cstdio>
21 #include <cstdint>
22 #include "ecmascript/mem/chunk.h"
23 #include "ecmascript/mem/c_containers.h"
24 #include "ecmascript/mem/c_string.h"
25 #include "ecmascript/mem/dyn_chunk.h"
26 #include "ecmascript/regexp/regexp_opcode.h"
27 #include "unicode/stringpiece.h"
28 #include "unicode/uchar.h"
29 #include "unicode/utf16.h"
30 #include "unicode/utf8.h"
31 #include "unicode/utypes.h"
32 #include "unicode/udata.h"
33 
34 namespace panda::ecmascript {
35 class RegExpParser {
36 public:
37     static constexpr auto FLAG_GLOBAL = (1U << 0U);
38     static constexpr auto FLAG_IGNORECASE = (1U << 1U);
39     static constexpr auto FLAG_MULTILINE = (1U << 2U);
40     static constexpr auto FLAG_DOTALL = (1U << 3U);
41     static constexpr auto FLAG_UTF16 = (1U << 4U);
42     static constexpr auto FLAG_STICKY = (1U << 5U);
43     static const uint32_t KEY_EOF = UINT32_MAX;
44     static constexpr int CLASS_RANGE_BASE = 0x40000000;
45     static constexpr uint32_t NUM_CAPTURE__OFFSET = 4;
46     static constexpr uint32_t NUM_STACK_OFFSET = 8;
47     static constexpr uint32_t OCTAL_VALUE = 8;
48     static constexpr uint32_t OCTAL_VALUE_RANGE = 32;
49     static constexpr uint32_t HEX_VALUE = 16;
50     static constexpr uint32_t DECIMAL_DIGITS_ADVANCE = 10;
51     static constexpr uint32_t FLAGS_OFFSET = 12;
52     static constexpr uint32_t OP_START_OFFSET = 16;
53     static constexpr uint32_t UNICODE_HEX_VALUE = 4;
54     static constexpr uint32_t UNICODE_HEX_ADVANCE = 2;
55     static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3;
56     static constexpr uint32_t UTF8_CHAR_LEN_MAX = 6;
57     static int Canonicalize(int c, bool isUnicode);
58 
RegExpParser(Chunk * chunk)59     explicit RegExpParser(Chunk *chunk)
60         : base_(nullptr),
61           pc_(nullptr),
62           end_(nullptr),
63           flags_(0),
64           c0_(KEY_EOF),
65           captureCount_(0),
66           stackCount_(0),
67           isError_(false),
68           isEmpty_(false),
69           buffer_(chunk),
70           groupNames_(chunk)
71     {
72     }
73 
~RegExpParser()74     ~RegExpParser()
75     {
76         Clear();
77     }
78 
79     NO_COPY_SEMANTIC(RegExpParser);
80     NO_MOVE_SEMANTIC(RegExpParser);
81 
Init(char * source,size_t length,uint32_t flags)82     inline void Init(char *source, size_t length, uint32_t flags)
83     {
84         pc_ = reinterpret_cast<uint8_t *>(source);
85         base_ = pc_;
86         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
87         end_ = reinterpret_cast<uint8_t *>(source) + length - 1;
88         flags_ = flags;
89     }
90 
91     void Parse();
92     void ParseDisjunction(bool isBackward);
93     void ParseAlternative(bool isBackward);
94     bool ParseAssertionCapture(int *captureIndex, bool isBackward);
95     void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd);
96     int ParseDecimalDigits();
97     int ParseAtomEscape(bool isBackward);
98     int ParseCharacterEscape();
99     bool ParseGroupSpecifier(const uint8_t **pp, CString &name);
100     int ParseCaptureCount(const char *groupName);
101     bool ParseClassRanges(RangeSet *result);
102     void ParseNonemptyClassRangesNoDash(DynChunk *buffer);
103     uint32_t ParseClassAtom(RangeSet *atom);
104     int ParseClassEscape(RangeSet *atom);
105     void ParseError(const char *errorMessage);
106     void ParseUnicodePropertyValueCharacters(bool *isValue);
107     int FindGroupName(const CString &name);
108     uint32_t ParseOctalLiteral();
109     bool ParseHexEscape(int length, uint32_t *value);
110     bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value);
111     bool ParseUnicodeEscape(uint32_t *value);
112     bool ParserIntervalQuantifier(int *pmin, int *pmax);
113     bool HasNamedCaptures();
114     int ParseEscape(const uint8_t **pp, int isUtf16);
115     int RecountCaptures();
116     int IsIdentFirst(uint32_t c);
117     bool NeedIntersection(uint32_t c);
118 
GetGroupNames()119     inline CVector<CString> GetGroupNames() const
120     {
121         return newGroupNames_;
122     }
123 
GetGroupNamesSize()124     inline size_t GetGroupNamesSize() const
125     {
126         return groupNames_.size_;
127     }
128 
IsError()129     inline bool IsError() const
130     {
131         return isError_;
132     }
133 
GetOriginBuffer()134     inline uint8_t *GetOriginBuffer() const
135     {
136         return buffer_.buf_;
137     }
138 
GetOriginBufferSize()139     inline size_t GetOriginBufferSize() const
140     {
141         return buffer_.size_;
142     }
143 
GetErrorMsg()144     inline CString GetErrorMsg() const
145     {
146         if (isError_) {
147             return CString(errorMsg_);
148         }
149         return CString("");
150     }
151 
IsGlobal()152     inline bool IsGlobal() const
153     {
154         return (flags_ & FLAG_GLOBAL) != 0;
155     }
156 
IsIgnoreCase()157     inline bool IsIgnoreCase() const
158     {
159         return (flags_ & FLAG_IGNORECASE) != 0;
160     }
161 
IsMultiline()162     inline bool IsMultiline() const
163     {
164         return (flags_ & FLAG_MULTILINE) != 0;
165     }
166 
IsDotAll()167     inline bool IsDotAll() const
168     {
169         return (flags_ & FLAG_DOTALL) != 0;
170     }
171 
IsUtf16()172     inline bool IsUtf16() const
173     {
174         return (flags_ & FLAG_UTF16) != 0;
175     }
176 
IsStick()177     inline bool IsStick() const
178     {
179         return (flags_ & FLAG_STICKY) != 0;
180     }
181 
GetcurrentCharNext(int c)182     inline static int GetcurrentCharNext(int c)
183     {
184         int cur = c;
185         c = u_tolower(static_cast<UChar32>(c));
186         if (c == cur) {
187             c = u_toupper(static_cast<UChar32>(c));
188         }
189         if (((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) &&
190             !((cur >= 'A' && cur <= 'Z') || (cur >= 'a' && cur <= 'z'))) {
191             c = cur;
192         }
193         return c;
194     }
ProcessIntersection(RangeSet * result)195     inline static void ProcessIntersection(RangeSet *result)
196     {
197         RangeSet cr;
198         RangeSet cr1;
199         const uint32_t MINLOWERCHAR = 'a';
200         const uint32_t MAXLOWERCHAR = 'z' + 1;
201         const uint32_t MINUPPERCHAR = 'A';
202         const uint32_t MAXUPPERCHAR = 'Z' + 1;
203         // Range values for a and z + 1
204         cr.Insert(MINLOWERCHAR, MAXLOWERCHAR);
205         // Range values for A and Z + 1
206         cr.Insert(MINUPPERCHAR, MAXUPPERCHAR);
207         result->Inter(cr1, cr);
208         result->Insert(cr1);
209     }
210 private:
211     friend class RegExpExecutor;
212     static constexpr int TMP_BUF_SIZE = 128;
Clear()213     void Clear()
214     {
215         base_ = nullptr;
216         pc_ = nullptr;
217         end_ = nullptr;
218         c0_ = KEY_EOF;
219         isError_ = false;
220         isEmpty_ = false;
221     }
222 
Advance()223     void Advance()
224     {
225         if (pc_ <= end_) {
226             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
227             c0_ = *pc_++;
228         } else {
229             c0_ = KEY_EOF;
230         }
231     }
232 
Advance(int offset)233     void Advance(int offset)
234     {
235         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
236         pc_ += offset - 1;
237         Advance();
238     }
239 
Prev()240     void Prev()
241     {
242         if (pc_ >= base_) {
243             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
244             c0_ = *pc_--;
245         } else {
246             c0_ = KEY_EOF;
247         }
248     }
249 
SetIsError()250     void SetIsError()
251     {
252         isError_ = true;
253     }
254 
255     void PrintF(const char *fmt, ...);
256     uint8_t *base_;
257     uint8_t *pc_;
258     uint8_t *end_;
259     uint32_t flags_;
260     uint32_t c0_;
261     int captureCount_;
262     int stackCount_;
263     bool isError_;
264     bool isEmpty_;
265     char errorMsg_[TMP_BUF_SIZE] = {0};  // NOLINTNEXTLINE(modernize-avoid-c-arrays)
266     int hasNamedCaptures_ = -1;
267     int totalCaptureCount_ = -1;
268     DynChunk buffer_;
269     DynChunk groupNames_;
270     CVector<CString> newGroupNames_;
271 };
272 }  // namespace panda::ecmascript
273 #endif  // ECMASCRIPT_REGEXP_PARSER_H
274