• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef ECMASCRIPT_REGEXP_PARSER_H
17 #define ECMASCRIPT_REGEXP_PARSER_H
18 
19 #include <cstdarg>
20 #include <cstdio>
21 #include <cstdint>
22 #include "ecmascript/mem/chunk.h"
23 #include "ecmascript/mem/c_containers.h"
24 #include "ecmascript/mem/c_string.h"
25 #include "ecmascript/mem/dyn_chunk.h"
26 #include "ecmascript/regexp/regexp_opcode.h"
27 #include "unicode/stringpiece.h"
28 #include "unicode/uchar.h"
29 #include "unicode/utf16.h"
30 #include "unicode/utf8.h"
31 #include "unicode/utypes.h"
32 #include "unicode/udata.h"
33 
34 namespace panda::ecmascript {
35 class RegExpParser {
36 public:
37     static constexpr auto FLAG_GLOBAL = (1U << 0U);
38     static constexpr auto FLAG_IGNORECASE = (1U << 1U);
39     static constexpr auto FLAG_MULTILINE = (1U << 2U);
40     static constexpr auto FLAG_DOTALL = (1U << 3U);
41     static constexpr auto FLAG_UTF16 = (1U << 4U);
42     static constexpr auto FLAG_STICKY = (1U << 5U);
43     static const uint32_t KEY_EOF = UINT32_MAX;
44     static constexpr int CLASS_RANGE_BASE = 0x40000000;
45     static constexpr uint32_t NUM_CAPTURE__OFFSET = 4;
46     static constexpr uint32_t NUM_STACK_OFFSET = 8;
47     static constexpr uint32_t OCTAL_VALUE = 8;
48     static constexpr uint32_t OCTAL_VALUE_RANGE = 32;
49     static constexpr uint32_t HEX_VALUE = 16;
50     static constexpr int32_t DECIMAL_DIGITS_ADVANCE = 10;
51     static constexpr uint32_t FLAGS_OFFSET = 12;
52     static constexpr uint32_t OP_START_OFFSET = 16;
53     static constexpr uint32_t UNICODE_HEX_VALUE = 4;
54     static constexpr uint32_t UNICODE_HEX_ADVANCE = 2;
55     static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3;
56     static constexpr uint32_t UTF8_CHAR_LEN_MAX = 6;
57 
RegExpParser(Chunk * chunk)58     explicit RegExpParser(Chunk *chunk)
59         : base_(nullptr),
60           pc_(nullptr),
61           end_(nullptr),
62           flags_(0),
63           c0_(KEY_EOF),
64           captureCount_(0),
65           stackCount_(0),
66           isError_(false),
67           buffer_(chunk),
68           groupNames_(chunk)
69     {
70     }
71 
~RegExpParser()72     ~RegExpParser()
73     {
74         Clear();
75     }
76 
77     NO_COPY_SEMANTIC(RegExpParser);
78     NO_MOVE_SEMANTIC(RegExpParser);
79 
Init(char * source,size_t length,uint32_t flags)80     inline void Init(char *source, size_t length, uint32_t flags)
81     {
82         pc_ = reinterpret_cast<uint8_t *>(source);
83         base_ = pc_;
84         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
85         end_ = reinterpret_cast<uint8_t *>(source) + length - 1;
86         flags_ = flags;
87     }
88 
89     void Parse();
90     void ParseDisjunction(bool isBackward);
91     void ParseAlternative(bool isBackward);
92     bool ParseAssertionCapture(int *captureIndex, bool isBackward);
93     void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd);
94     int ParseDecimalDigits();
95     int ParseAtomEscape(bool isBackward);
96     int ParseCharacterEscape();
97     bool ParseGroupSpecifier(const uint8_t **pp, CString &name);
98     int ParseCaptureCount(const char *groupName);
99     bool ParseClassRanges(RangeSet *result);
100     void ParseNonemptyClassRangesNoDash(DynChunk *buffer);
101     uint32_t ParseClassAtom(RangeSet *atom);
102     int ParseClassEscape(RangeSet *atom);
103     void ParseError(const char *errorMessage);
104     void ParseUnicodePropertyValueCharacters(bool *isValue);
105     int FindGroupName(const CString &name);
106     uint32_t ParseOctalLiteral();
107     bool ParseHexEscape(int length, uint32_t *value);
108     bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value);
109     bool ParseUnicodeEscape(uint32_t *value);
110     bool ParserIntervalQuantifier(int *pmin, int *pmax);
111     bool HasNamedCaptures();
112     int ParseEscape(const uint8_t **pp, int isUtf16);
113     int RecountCaptures();
114     int IsIdentFirst(uint32_t c);
115 
GetGroupNames()116     inline CVector<CString> GetGroupNames() const
117     {
118         return newGroupNames_;
119     }
120 
GetGroupNamesSize()121     inline size_t GetGroupNamesSize() const
122     {
123         return groupNames_.size_ ;
124     }
125 
IsError()126     inline bool IsError() const
127     {
128         return isError_;
129     }
130 
GetOriginBuffer()131     inline uint8_t *GetOriginBuffer() const
132     {
133         return buffer_.buf_;
134     }
135 
GetOriginBufferSize()136     inline size_t GetOriginBufferSize() const
137     {
138         return buffer_.size_;
139     }
140 
GetErrorMsg()141     inline CString GetErrorMsg() const
142     {
143         if (isError_) {
144             return CString(errorMsg_);
145         }
146         return CString("");
147     }
148 
IsGlobal()149     inline bool IsGlobal() const
150     {
151         return (flags_ & FLAG_GLOBAL) != 0;
152     }
153 
IsIgnoreCase()154     inline bool IsIgnoreCase() const
155     {
156         return (flags_ & FLAG_IGNORECASE) != 0;
157     }
158 
IsMultiline()159     inline bool IsMultiline() const
160     {
161         return (flags_ & FLAG_MULTILINE) != 0;
162     }
163 
IsDotAll()164     inline bool IsDotAll() const
165     {
166         return (flags_ & FLAG_DOTALL) != 0;
167     }
168 
IsUtf16()169     inline bool IsUtf16() const
170     {
171         return (flags_ & FLAG_UTF16) != 0;
172     }
173 
IsStick()174     inline bool IsStick() const
175     {
176         return (flags_ & FLAG_STICKY) != 0;
177     }
178 
Canonicalize(int c,bool isUnicode)179     inline static int Canonicalize(int c, bool isUnicode)
180     {
181         if (c < TMP_BUF_SIZE) {  // NOLINTNEXTLINE(readability-magic-numbers)
182             if (c >= 'a' && c <= 'z') {
183                 c = c - 'a' + 'A';
184             }
185         } else {
186             if (isUnicode) {
187                 c = u_toupper(static_cast<UChar32>(c));
188             }
189         }
190         return c;
191     }
192 
193 private:
194     friend class RegExpExecutor;
195     static constexpr int TMP_BUF_SIZE = 128;
Clear()196     void Clear()
197     {
198         base_ = nullptr;
199         pc_ = nullptr;
200         end_ = nullptr;
201         c0_ = KEY_EOF;
202         isError_ = false;
203     }
204 
Advance()205     void Advance()
206     {
207         if (pc_ <= end_) {
208             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
209             c0_ = *pc_++;
210         } else {
211             c0_ = KEY_EOF;
212         }
213     }
214 
Advance(int offset)215     void Advance(int offset)
216     {
217         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
218         pc_ += offset - 1;
219         Advance();
220     }
221 
Prev()222     void Prev()
223     {
224         if (pc_ >= base_) {
225             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
226             c0_ = *pc_--;
227         } else {
228             c0_ = KEY_EOF;
229         }
230     }
231 
SetIsError()232     void SetIsError()
233     {
234         isError_ = true;
235     }
236 
237     void PrintF(const char *fmt, ...);
238     uint8_t *base_;
239     uint8_t *pc_;
240     uint8_t *end_;
241     uint32_t flags_;
242     uint32_t c0_;
243     int captureCount_;
244     int stackCount_;
245     bool isError_;
246     char errorMsg_[TMP_BUF_SIZE] = {0};  // NOLINTNEXTLINE(modernize-avoid-c-arrays)
247     int hasNamedCaptures_ = -1;
248     int totalCaptureCount_ = -1;
249     DynChunk buffer_;
250     DynChunk groupNames_;
251     CVector<CString> newGroupNames_;
252 };
253 }  // namespace panda::ecmascript
254 #endif  // ECMASCRIPT_REGEXP_PARSER_H
255