• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (c) 2021 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 
16 #ifndef ECMASCRIPT_REGEXP_PARSER_H
17 #define ECMASCRIPT_REGEXP_PARSER_H
18 
19 #include <cstdarg>
20 #include <cstdio>
21 #include <cstdint>
22 #include "ecmascript/mem/chunk.h"
23 #include "ecmascript/regexp/dyn_chunk.h"
24 #include "ecmascript/regexp/regexp_opcode.h"
25 #include "unicode/stringpiece.h"
26 #include "unicode/uchar.h"
27 #include "unicode/utf16.h"
28 #include "unicode/utf8.h"
29 #include "unicode/utypes.h"
30 
31 namespace panda::ecmascript {
32 class RegExpParser {
33 public:
34     static constexpr auto FLAG_GLOBAL = (1U << 0U);
35     static constexpr auto FLAG_IGNORECASE = (1U << 1U);
36     static constexpr auto FLAG_MULTILINE = (1U << 2U);
37     static constexpr auto FLAG_DOTALL = (1U << 3U);
38     static constexpr auto FLAG_UTF16 = (1U << 4U);
39     static constexpr auto FLAG_STICKY = (1U << 5U);
40     static const int KEY_EOF = -1;
41     static constexpr int CLASS_RANGE_BASE = 0x40000000;
42     static constexpr uint32_t NUM_CAPTURE__OFFSET = 4;
43     static constexpr uint32_t NUM_STACK_OFFSET = 8;
44     static constexpr uint32_t OCTAL_VALUE = 8;
45     static constexpr uint32_t OCTAL_VALUE_RANGE = 32;
46     static constexpr uint32_t HEX_VALUE = 16;
47     static constexpr int32_t DECIMAL_DIGITS_ADVANCE = 10;
48     static constexpr uint32_t FLAGS_OFFSET = 12;
49     static constexpr uint32_t OP_START_OFFSET = 16;
50     static constexpr uint32_t UNICODE_HEX_VALUE = 4;
51     static constexpr uint32_t UNICODE_HEX_ADVANCE = 2;
52     static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3;
53 
RegExpParser(Chunk * chunk)54     explicit RegExpParser(Chunk *chunk)
55         : base_(nullptr),
56           pc_(nullptr),
57           end_(nullptr),
58           flags_(0),
59           c0_(KEY_EOF),
60           captureCount_(0),
61           stackCount_(0),
62           isError_(false),
63           buffer_(chunk),
64           groupNames_(chunk)
65     {
66     }
67 
~RegExpParser()68     ~RegExpParser()
69     {
70         Clear();
71     }
72 
73     NO_COPY_SEMANTIC(RegExpParser);
74     NO_MOVE_SEMANTIC(RegExpParser);
75 
Init(char * source,size_t length,uint32_t flags)76     inline void Init(char *source, size_t length, uint32_t flags)
77     {
78         pc_ = reinterpret_cast<uint8_t *>(source);
79         base_ = pc_;
80         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
81         end_ = reinterpret_cast<uint8_t *>(source) + length - 1;
82         flags_ = flags;
83     }
84 
85     void Parse();
86     void ParseDisjunction(bool isBackward);
87     void ParseAlternative(bool isBackward);
88     bool ParseAssertionCapture(int *captureIndex, bool isBackward);
89     void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd);
90     int ParseDecimalDigits();
91     int ParseAtomEscape(bool isBackward);
92     int ParseCharacterEscape();
93     bool ParseGroupSpecifier(const uint8_t **pp, CString &name);
94     int ParseCaptureCount(const char *groupName);
95     bool ParseClassRanges(RangeSet *result);
96     void ParseNonemptyClassRangesNoDash(DynChunk *buffer);
97     uint32_t ParseClassAtom(RangeSet *atom);
98     int ParseClassEscape(RangeSet *atom);
99     void ParseError(const char *errorMessage);
100     void ParseUnicodePropertyValueCharacters(bool *isValue);
101     int FindGroupName(const CString &name);
102     uint32_t ParseOctalLiteral();
103     bool ParseHexEscape(int length, uint32_t *value);
104     bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value);
105     bool ParseUnicodeEscape(uint32_t *value);
106     bool ParserIntervalQuantifier(int *pmin, int *pmax);
107 
IsError()108     inline bool IsError() const
109     {
110         return isError_;
111     }
112 
GetOriginBuffer()113     inline uint8_t *GetOriginBuffer() const
114     {
115         return buffer_.buf_;
116     }
117 
GetOriginBufferSize()118     inline size_t GetOriginBufferSize() const
119     {
120         return buffer_.size_;
121     }
122 
GetErrorMsg()123     inline CString GetErrorMsg() const
124     {
125         if (isError_) {
126             return CString(errorMsg_);
127         }
128         return CString("");
129     }
130 
IsGlobal()131     inline bool IsGlobal() const
132     {
133         return (flags_ & FLAG_GLOBAL) != 0;
134     }
135 
IsIgnoreCase()136     inline bool IsIgnoreCase() const
137     {
138         return (flags_ & FLAG_IGNORECASE) != 0;
139     }
140 
IsMultiline()141     inline bool IsMultiline() const
142     {
143         return (flags_ & FLAG_MULTILINE) != 0;
144     }
145 
IsDotAll()146     inline bool IsDotAll() const
147     {
148         return (flags_ & FLAG_DOTALL) != 0;
149     }
150 
IsUtf16()151     inline bool IsUtf16() const
152     {
153         return (flags_ & FLAG_UTF16) != 0;
154     }
155 
IsStick()156     inline bool IsStick() const
157     {
158         return (flags_ & FLAG_STICKY) != 0;
159     }
160 
Canonicalize(int c,bool isUnicode)161     inline static int Canonicalize(int c, bool isUnicode)
162     {
163         if (c < TMP_BUF_SIZE) {  // NOLINTNEXTLINE(readability-magic-numbers)
164             if (c >= 'a' && c <= 'z') {
165                 c = c - 'a' + 'A';
166             }
167         } else {
168             if (isUnicode) {
169                 c = u_toupper(static_cast<UChar32>(c));
170             }
171         }
172         return c;
173     }
174 
175 private:
176     friend class RegExpExecutor;
177     static constexpr int TMP_BUF_SIZE = 128;
Clear()178     void Clear()
179     {
180         base_ = nullptr;
181         pc_ = nullptr;
182         end_ = nullptr;
183         c0_ = KEY_EOF;
184         isError_ = false;
185     }
186 
Advance()187     void Advance()
188     {
189         if (pc_ <= end_) {
190             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
191             c0_ = *pc_++;
192         } else {
193             c0_ = KEY_EOF;
194         }
195     }
196 
Advance(int offset)197     void Advance(int offset)
198     {
199         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
200         pc_ += offset - 1;
201         Advance();
202     }
203 
Prev()204     void Prev()
205     {
206         if (pc_ >= base_) {
207             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
208             c0_ = *pc_--;
209         } else {
210             c0_ = KEY_EOF;
211         }
212     }
213 
SetIsError()214     void SetIsError()
215     {
216         isError_ = true;
217     }
218 
219     void PrintF(const char *fmt, ...);
220     uint8_t *base_;
221     uint8_t *pc_;
222     uint8_t *end_;
223     uint32_t flags_;
224     int c0_;
225     int captureCount_;
226     int stackCount_;
227     bool isError_;
228     char errorMsg_[TMP_BUF_SIZE] = {0};  // NOLINTNEXTLINE(modernize-avoid-c-arrays)
229     DynChunk buffer_;
230     DynChunk groupNames_;
231 };
232 }  // namespace panda::ecmascript
233 #endif  // ECMASCRIPT_REGEXP_PARSER_H