• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (c) 2021 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 
16 #ifndef ECMASCRIPT_REGEXP_PARSER_H
17 #define ECMASCRIPT_REGEXP_PARSER_H
18 
19 #include <cstdarg>
20 #include <cstdio>
21 #include <cstdint>
22 #include "ecmascript/js_thread.h"
23 #include "ecmascript/ecma_macros.h"
24 #include "ecmascript/mem/chunk.h"
25 #include "ecmascript/mem/c_containers.h"
26 #include "ecmascript/mem/c_string.h"
27 #include "ecmascript/mem/dyn_chunk.h"
28 #include "ecmascript/regexp/regexp_opcode.h"
29 #include "unicode/stringpiece.h"
30 #include "unicode/uchar.h"
31 #include "unicode/utf16.h"
32 #include "unicode/utf8.h"
33 #include "unicode/utypes.h"
34 #include "unicode/udata.h"
35 #include "unicode/uniset.h"
36 
37 namespace panda::ecmascript {
38 class RegExpParser {
39 public:
40     static constexpr auto FLAG_GLOBAL = (1U << 0U);
41     static constexpr auto FLAG_IGNORECASE = (1U << 1U);
42     static constexpr auto FLAG_MULTILINE = (1U << 2U);
43     static constexpr auto FLAG_DOTALL = (1U << 3U);
44     static constexpr auto FLAG_UTF16 = (1U << 4U);
45     static constexpr auto FLAG_STICKY = (1U << 5U);
46     static constexpr auto FLAG_HASINDICES = (1U << 6U);
47     static constexpr uint32_t FLAG_NUM = 7;
48     static const uint32_t KEY_EOF = UINT32_MAX;
49     static constexpr int CLASS_RANGE_BASE = 0x40000000;
50     static constexpr uint32_t NUM_CAPTURE__OFFSET = 4;
51     static constexpr uint32_t NUM_STACK_OFFSET = 8;
52     static constexpr uint32_t OCTAL_VALUE = 8;
53     static constexpr uint32_t OCTAL_VALUE_RANGE = 32;
54     static constexpr uint32_t HEX_VALUE = 16;
55     static constexpr uint32_t DECIMAL_DIGITS_ADVANCE = 10;
56     static constexpr uint32_t FLAGS_OFFSET = 12;
57     static constexpr uint32_t PREFILTER_OFFSET = 16;
58     static constexpr uint32_t OP_START_OFFSET = 20;
59     static constexpr uint32_t UNICODE_HEX_VALUE = 4;
60     static constexpr uint32_t UNICODE_HEX_ADVANCE = 2;
61     static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3;
62     static constexpr uint32_t UTF8_CHAR_LEN_MAX = 6;
63     static constexpr size_t SPARSE_HEAD_OFFSET = 3;
64     static constexpr size_t SPARSE_OFF_OFFSET = 2;
65     static constexpr size_t SPARSE_MAX_OFFSET = 6;
66     static int Canonicalize(int c, bool isUnicode);
67 
RegExpParser(JSThread * thread,Chunk * chunk)68     explicit RegExpParser(JSThread *thread, Chunk *chunk)
69         : thread_(thread),
70           base_(nullptr),
71           pc_(nullptr),
72           end_(nullptr),
73           flags_(0),
74           c0_(KEY_EOF),
75           captureCount_(0),
76           stackCount_(0),
77           isError_(false),
78           isEmpty_(false),
79           buffer_(chunk),
80           groupNames_(chunk)
81     {
82     }
83 
~RegExpParser()84     ~RegExpParser()
85     {
86         Clear();
87     }
88 
89     NO_COPY_SEMANTIC(RegExpParser);
90     NO_MOVE_SEMANTIC(RegExpParser);
91 
Init(char * source,size_t length,uint32_t flags)92     inline void Init(char *source, size_t length, uint32_t flags)
93     {
94         pc_ = reinterpret_cast<uint8_t *>(source);
95         base_ = pc_;
96         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
97         end_ = reinterpret_cast<uint8_t *>(source) + length - 1;
98         flags_ = flags;
99     }
100 
101     void Parse();
102     void ParseDisjunction(bool isBackward);
103     void ParseAlternative(bool isBackward);
104     bool ParseAssertionCapture(int *captureIndex, bool isBackward);
105     void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd);
106     int ParseDecimalDigits();
107     int ParseAtomEscape(bool isBackward);
108     int ParseCharacterEscape();
109     bool ParseGroupSpecifier(const uint8_t **pp, CString &name);
110     int ParseCaptureCount(const char *groupName);
111     bool ParseClassRanges(RangeSet *result);
112     void ParseNonemptyClassRangesNoDash(DynChunk *buffer);
113     uint32_t ParseClassAtom(RangeSet *atom);
114     int ParseClassEscape(RangeSet *atom);
115     void ParseError(const char *errorMessage);
116     bool ParseUnicodePropertyValueCharacters(CString &categoryName, CString &valueName);
117     int FindGroupName(const CString &name);
118     uint32_t ParseOctalLiteral();
119     bool ParseHexEscape(int length, uint32_t *value);
120     bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value);
121     bool ParseUnicodeEscape(uint32_t *value);
122     bool ParserIntervalQuantifier(int *pmin, int *pmax);
123     bool HasNamedCaptures();
124     int ParseEscape(const uint8_t **pp, int isUtf16);
125     int RecountCaptures();
126     int IsIdentFirst(uint32_t c);
127     bool NeedIntersection(uint32_t c);
128     void DoParserStackOverflowCheck(const char *errorMessage);
129     bool MatchUnicodeProperty(UProperty property, const char *propertyName, RangeSet *atom, bool negate);
130     bool IsExactPropertyValueAlis(const char *valueName, UProperty property, int32_t propertyValue);
131     bool ParseUnicodePropertyClassRange(CString &propertyName, CString &valueName, RangeSet *atom, bool negate);
132     bool GetUnicodePropertyName(CString &propertyName);
133     bool GetUnicodePropertyValueName(CString &valueName);
134     bool IsExactPropertyAlias(const char *propertyName, UProperty property);
135     bool MatchSepcialUnicodeProperty(CString &name, bool negate, RangeSet *atom);
136     bool IsSupportedBinaryProperty(UProperty property);
137     bool IsBinaryPropertyOfStrings(UProperty property);
GetGroupNames()138     inline CVector<CString> GetGroupNames() const
139     {
140         return newGroupNames_;
141     }
142 
GetGroupNamesSize()143     inline size_t GetGroupNamesSize() const
144     {
145         return groupNames_.size_;
146     }
147 
IsError()148     inline bool IsError() const
149     {
150         return isError_;
151     }
152 
GetOriginBuffer()153     inline uint8_t *GetOriginBuffer() const
154     {
155         return buffer_.buf_;
156     }
157 
GetOriginBufferSize()158     inline size_t GetOriginBufferSize() const
159     {
160         return buffer_.size_;
161     }
162 
GetErrorMsg()163     inline CString GetErrorMsg() const
164     {
165         if (isError_) {
166             return CString(errorMsg_);
167         }
168         return CString("");
169     }
170 
IsGlobal()171     inline bool IsGlobal() const
172     {
173         return (flags_ & FLAG_GLOBAL) != 0;
174     }
175 
IsIgnoreCase()176     inline bool IsIgnoreCase() const
177     {
178         return (flags_ & FLAG_IGNORECASE) != 0;
179     }
180 
IsMultiline()181     inline bool IsMultiline() const
182     {
183         return (flags_ & FLAG_MULTILINE) != 0;
184     }
185 
IsDotAll()186     inline bool IsDotAll() const
187     {
188         return (flags_ & FLAG_DOTALL) != 0;
189     }
190 
IsUtf16()191     inline bool IsUtf16() const
192     {
193         return (flags_ & FLAG_UTF16) != 0;
194     }
195 
IsStick()196     inline bool IsStick() const
197     {
198         return (flags_ & FLAG_STICKY) != 0;
199     }
200 
IsUnicodePropertyValueCharacter(char c)201     inline bool IsUnicodePropertyValueCharacter(char c) const
202     {
203         if (c >= 'a' && c <= 'z') {
204             return true;
205         }
206         if (c >= 'A' && c <= 'Z') {
207             return true;
208         }
209         if (c >= '0' && c <= '9') {
210             return true;
211         }
212         return (c == '_');
213     }
214 
GetcurrentCharNext(int c)215     inline static int GetcurrentCharNext(int c)
216     {
217         int cur = c;
218         c = u_tolower(static_cast<UChar32>(c));
219         if (c == cur) {
220             c = u_toupper(static_cast<UChar32>(c));
221         }
222         if (((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) &&
223             !((cur >= 'A' && cur <= 'Z') || (cur >= 'a' && cur <= 'z'))) {
224             c = cur;
225         }
226         return c;
227     }
ProcessIntersection(RangeSet * result)228     inline static void ProcessIntersection(RangeSet *result)
229     {
230         RangeSet cr;
231         RangeSet cr1;
232         const uint32_t MINLOWERCHAR = 'a';
233         const uint32_t MAXLOWERCHAR = 'z' + 1;
234         const uint32_t MINUPPERCHAR = 'A';
235         const uint32_t MAXUPPERCHAR = 'Z' + 1;
236         // Range values for a and z + 1
237         cr.Insert(MINLOWERCHAR, MAXLOWERCHAR);
238         // Range values for A and Z + 1
239         cr.Insert(MINUPPERCHAR, MAXUPPERCHAR);
240         result->Inter(cr1, cr);
241         result->Insert(cr1);
242     }
243 private:
244     friend class RegExpExecutor;
245     static constexpr int TMP_BUF_SIZE = 128;
Clear()246     void Clear()
247     {
248         base_ = nullptr;
249         pc_ = nullptr;
250         end_ = nullptr;
251         c0_ = KEY_EOF;
252         isError_ = false;
253         isEmpty_ = false;
254     }
255 
Advance()256     void Advance()
257     {
258         if (pc_ <= end_) {
259             DoParserStackOverflowCheck("Advance stack overflow!");
260             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
261             c0_ = *pc_++;
262         } else {
263             c0_ = KEY_EOF;
264         }
265     }
266 
Advance(int offset)267     void Advance(int offset)
268     {
269         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
270         pc_ += offset - 1;
271         Advance();
272     }
273 
Prev()274     void Prev()
275     {
276         if (pc_ >= base_) {
277             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
278             c0_ = *pc_--;
279         } else {
280             c0_ = KEY_EOF;
281         }
282     }
283 
SetIsError()284     void SetIsError()
285     {
286         isError_ = true;
287     }
288 
289     void PrintF(const char *fmt, ...);
290     JSThread *thread_;
291     uint8_t *base_;
292     uint8_t *pc_;
293     uint8_t *end_;
294     uint32_t flags_;
295     uint32_t c0_;
296     int captureCount_;
297     int stackCount_;
298     bool isError_;
299     bool isEmpty_;
300     char errorMsg_[TMP_BUF_SIZE] = {0};  // NOLINTNEXTLINE(modernize-avoid-c-arrays)
301     int hasNamedCaptures_ = -1;
302     int totalCaptureCount_ = -1;
303     DynChunk buffer_;
304     DynChunk groupNames_;
305     CVector<CString> newGroupNames_;
306 };
307 }  // namespace panda::ecmascript
308 #endif  // ECMASCRIPT_REGEXP_PARSER_H
309