• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef ECMASCRIPT_REGEXP_PARSER_H
17 #define ECMASCRIPT_REGEXP_PARSER_H
18 
19 #include <cstdarg>
20 #include <cstdio>
21 #include <cstdint>
22 #include "ecmascript/mem/chunk.h"
23 #include "ecmascript/mem/c_containers.h"
24 #include "ecmascript/mem/c_string.h"
25 #include "ecmascript/mem/dyn_chunk.h"
26 #include "ecmascript/regexp/regexp_opcode.h"
27 #include "unicode/stringpiece.h"
28 #include "unicode/uchar.h"
29 #include "unicode/utf16.h"
30 #include "unicode/utf8.h"
31 #include "unicode/utypes.h"
32 #include "unicode/udata.h"
33 
34 namespace panda::ecmascript {
35 class RegExpParser {
36 public:
37     static constexpr auto FLAG_GLOBAL = (1U << 0U);
38     static constexpr auto FLAG_IGNORECASE = (1U << 1U);
39     static constexpr auto FLAG_MULTILINE = (1U << 2U);
40     static constexpr auto FLAG_DOTALL = (1U << 3U);
41     static constexpr auto FLAG_UTF16 = (1U << 4U);
42     static constexpr auto FLAG_STICKY = (1U << 5U);
43     static const uint32_t KEY_EOF = UINT32_MAX;
44     static constexpr int CLASS_RANGE_BASE = 0x40000000;
45     static constexpr uint32_t NUM_CAPTURE__OFFSET = 4;
46     static constexpr uint32_t NUM_STACK_OFFSET = 8;
47     static constexpr uint32_t OCTAL_VALUE = 8;
48     static constexpr uint32_t OCTAL_VALUE_RANGE = 32;
49     static constexpr uint32_t HEX_VALUE = 16;
50     static constexpr uint32_t DECIMAL_DIGITS_ADVANCE = 10;
51     static constexpr uint32_t FLAGS_OFFSET = 12;
52     static constexpr uint32_t OP_START_OFFSET = 16;
53     static constexpr uint32_t UNICODE_HEX_VALUE = 4;
54     static constexpr uint32_t UNICODE_HEX_ADVANCE = 2;
55     static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3;
56     static constexpr uint32_t UTF8_CHAR_LEN_MAX = 6;
57     static int Canonicalize(int c, bool isUnicode);
58 
RegExpParser(Chunk * chunk)59     explicit RegExpParser(Chunk *chunk)
60         : base_(nullptr),
61           pc_(nullptr),
62           end_(nullptr),
63           flags_(0),
64           c0_(KEY_EOF),
65           captureCount_(0),
66           stackCount_(0),
67           isError_(false),
68           buffer_(chunk),
69           groupNames_(chunk)
70     {
71     }
72 
~RegExpParser()73     ~RegExpParser()
74     {
75         Clear();
76     }
77 
78     NO_COPY_SEMANTIC(RegExpParser);
79     NO_MOVE_SEMANTIC(RegExpParser);
80 
Init(char * source,size_t length,uint32_t flags)81     inline void Init(char *source, size_t length, uint32_t flags)
82     {
83         pc_ = reinterpret_cast<uint8_t *>(source);
84         base_ = pc_;
85         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
86         end_ = reinterpret_cast<uint8_t *>(source) + length - 1;
87         flags_ = flags;
88     }
89 
90     void Parse();
91     void ParseDisjunction(bool isBackward);
92     void ParseAlternative(bool isBackward);
93     bool ParseAssertionCapture(int *captureIndex, bool isBackward);
94     void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd);
95     int ParseDecimalDigits();
96     int ParseAtomEscape(bool isBackward);
97     int ParseCharacterEscape();
98     bool ParseGroupSpecifier(const uint8_t **pp, CString &name);
99     int ParseCaptureCount(const char *groupName);
100     bool ParseClassRanges(RangeSet *result);
101     void ParseNonemptyClassRangesNoDash(DynChunk *buffer);
102     uint32_t ParseClassAtom(RangeSet *atom);
103     int ParseClassEscape(RangeSet *atom);
104     void ParseError(const char *errorMessage);
105     void ParseUnicodePropertyValueCharacters(bool *isValue);
106     int FindGroupName(const CString &name);
107     uint32_t ParseOctalLiteral();
108     bool ParseHexEscape(int length, uint32_t *value);
109     bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value);
110     bool ParseUnicodeEscape(uint32_t *value);
111     bool ParserIntervalQuantifier(int *pmin, int *pmax);
112     bool HasNamedCaptures();
113     int ParseEscape(const uint8_t **pp, int isUtf16);
114     int RecountCaptures();
115     int IsIdentFirst(uint32_t c);
116     bool NeedIntersection(uint32_t c);
117 
GetGroupNames()118     inline CVector<CString> GetGroupNames() const
119     {
120         return newGroupNames_;
121     }
122 
GetGroupNamesSize()123     inline size_t GetGroupNamesSize() const
124     {
125         return groupNames_.size_;
126     }
127 
IsError()128     inline bool IsError() const
129     {
130         return isError_;
131     }
132 
GetOriginBuffer()133     inline uint8_t *GetOriginBuffer() const
134     {
135         return buffer_.buf_;
136     }
137 
GetOriginBufferSize()138     inline size_t GetOriginBufferSize() const
139     {
140         return buffer_.size_;
141     }
142 
GetErrorMsg()143     inline CString GetErrorMsg() const
144     {
145         if (isError_) {
146             return CString(errorMsg_);
147         }
148         return CString("");
149     }
150 
IsGlobal()151     inline bool IsGlobal() const
152     {
153         return (flags_ & FLAG_GLOBAL) != 0;
154     }
155 
IsIgnoreCase()156     inline bool IsIgnoreCase() const
157     {
158         return (flags_ & FLAG_IGNORECASE) != 0;
159     }
160 
IsMultiline()161     inline bool IsMultiline() const
162     {
163         return (flags_ & FLAG_MULTILINE) != 0;
164     }
165 
IsDotAll()166     inline bool IsDotAll() const
167     {
168         return (flags_ & FLAG_DOTALL) != 0;
169     }
170 
IsUtf16()171     inline bool IsUtf16() const
172     {
173         return (flags_ & FLAG_UTF16) != 0;
174     }
175 
IsStick()176     inline bool IsStick() const
177     {
178         return (flags_ & FLAG_STICKY) != 0;
179     }
180 
GetcurrentCharNext(int c)181     inline static int GetcurrentCharNext(int c)
182     {
183         int cur = c;
184         c = u_tolower(static_cast<UChar32>(c));
185         if (c == cur) {
186             c = u_toupper(static_cast<UChar32>(c));
187         }
188         if (((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) &&
189             !((cur >= 'A' && cur <= 'Z') || (cur >= 'a' && cur <= 'z'))) {
190             c = cur;
191         }
192         return c;
193     }
ProcessIntersection(RangeSet * result)194     inline static void ProcessIntersection(RangeSet *result)
195     {
196         RangeSet cr;
197         RangeSet cr1;
198         const uint32_t MINLOWERCHAR = 'a';
199         const uint32_t MAXLOWERCHAR = 'z' + 1;
200         const uint32_t MINUPPERCHAR = 'A';
201         const uint32_t MAXUPPERCHAR = 'Z' + 1;
202         // Range values for a and z + 1
203         cr.Insert(MINLOWERCHAR, MAXLOWERCHAR);
204         // Range values for A and Z + 1
205         cr.Insert(MINUPPERCHAR, MAXUPPERCHAR);
206         result->Inter(cr1, cr);
207         result->Insert(cr1);
208     }
209 private:
210     friend class RegExpExecutor;
211     static constexpr int TMP_BUF_SIZE = 128;
Clear()212     void Clear()
213     {
214         base_ = nullptr;
215         pc_ = nullptr;
216         end_ = nullptr;
217         c0_ = KEY_EOF;
218         isError_ = false;
219     }
220 
Advance()221     void Advance()
222     {
223         if (pc_ <= end_) {
224             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
225             c0_ = *pc_++;
226         } else {
227             c0_ = KEY_EOF;
228         }
229     }
230 
Advance(int offset)231     void Advance(int offset)
232     {
233         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
234         pc_ += offset - 1;
235         Advance();
236     }
237 
Prev()238     void Prev()
239     {
240         if (pc_ >= base_) {
241             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
242             c0_ = *pc_--;
243         } else {
244             c0_ = KEY_EOF;
245         }
246     }
247 
SetIsError()248     void SetIsError()
249     {
250         isError_ = true;
251     }
252 
253     void PrintF(const char *fmt, ...);
254     uint8_t *base_;
255     uint8_t *pc_;
256     uint8_t *end_;
257     uint32_t flags_;
258     uint32_t c0_;
259     int captureCount_;
260     int stackCount_;
261     bool isError_;
262     char errorMsg_[TMP_BUF_SIZE] = {0};  // NOLINTNEXTLINE(modernize-avoid-c-arrays)
263     int hasNamedCaptures_ = -1;
264     int totalCaptureCount_ = -1;
265     DynChunk buffer_;
266     DynChunk groupNames_;
267     CVector<CString> newGroupNames_;
268 };
269 }  // namespace panda::ecmascript
270 #endif  // ECMASCRIPT_REGEXP_PARSER_H
271