• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef PANDA_RUNTIME_REGEXP_PARSER_H
17 #define PANDA_RUNTIME_REGEXP_PARSER_H
18 
19 #include <cstdarg>
20 #include <cstdio>
21 #include <cstdint>
22 #include "runtime/regexp/ecmascript/mem/dyn_chunk.h"
23 #include "runtime/regexp/ecmascript/regexp_opcode.h"
24 #include "unicode/stringpiece.h"
25 #include "unicode/uchar.h"
26 #include "unicode/utf16.h"
27 #include "unicode/utf8.h"
28 #include "unicode/utypes.h"
29 #include "unicode/udata.h"
30 
31 namespace ark {
32 class RegExpParser {
33 public:
34     static constexpr auto FLAG_GLOBAL = (1U << 0U);
35     static constexpr auto FLAG_IGNORECASE = (1U << 1U);
36     static constexpr auto FLAG_MULTILINE = (1U << 2U);
37     static constexpr auto FLAG_DOTALL = (1U << 3U);
38     static constexpr auto FLAG_UTF16 = (1U << 4U);
39     static constexpr auto FLAG_STICKY = (1U << 5U);
40     static constexpr auto FLAG_HASINDICES = (1U << 6U);
41     static const uint32_t KEY_EOF = UINT32_MAX;
42     static constexpr int CLASS_RANGE_BASE = 0x40000000;
43     static constexpr uint32_t NUM_CAPTURE__OFFSET = 4;
44     static constexpr uint32_t NUM_STACK_OFFSET = 8;
45     static constexpr uint32_t OCTAL_VALUE = 8;
46     static constexpr uint32_t OCTAL_VALUE_RANGE = 32;
47     static constexpr uint32_t HEX_VALUE = 16;
48     static constexpr int32_t DECIMAL_DIGITS_ADVANCE = 10;
49     static constexpr uint32_t FLAGS_OFFSET = 12;
50     static constexpr uint32_t OP_START_OFFSET = 16;
51     static constexpr uint32_t UNICODE_HEX_VALUE = 4;
52     static constexpr uint32_t UNICODE_HEX_ADVANCE = 2;
53     static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3;
54     static constexpr uint32_t UTF8_CHAR_LEN_MAX = 6;
55 
56     explicit RegExpParser() = default;
57 
~RegExpParser()58     ~RegExpParser()
59     {
60         Clear();
61     }
62 
63     NO_COPY_SEMANTIC(RegExpParser);
64     NO_MOVE_SEMANTIC(RegExpParser);
65 
Init(char * source,size_t length,uint32_t flags)66     inline void Init(char *source, size_t length, uint32_t flags)
67     {
68         pc_ = reinterpret_cast<uint8_t *>(source);
69         base_ = pc_;
70         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
71         end_ = reinterpret_cast<uint8_t *>(source) + length - 1;
72         flags_ = flags;
73     }
74 
75     PANDA_PUBLIC_API void Parse();
76     void ParseDisjunction(bool isBackward);
77     void ParseAlternative(bool isBackward);
78     bool ParseAssertionCapture(int *captureIndex, bool isBackward);
79     void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd);
80     int ParseDecimalDigits();
81     int ParseAtomEscape(bool isBackward);
82     int ParseCharacterEscape();
83     bool ParseGroupSpecifier(const uint8_t **pp, PandaString &name);
84     int ParseCaptureCount(const char *groupName);
85     bool ParseClassRanges(RangeSet *result);
86     void ParseNonemptyClassRangesNoDash(DynChunk *buffer);
87     uint32_t ParseClassAtom(RangeSet *atom);
88     int ParseClassEscape(RangeSet *atom);
89     void ParseError(const char *errorMessage);
90     void ParseUnicodePropertyValueCharactersImpl(bool *isValue);
91     int FindGroupName(const PandaString &name);
92     uint32_t ParseOctalLiteral();
93     bool ParseHexEscape(int length, uint32_t *value);
94     bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value);
95     bool ParseUnicodeEscape(uint32_t *value);
96     bool ParserIntervalQuantifier(int *pmin, int *pmax);
97     bool HasNamedCaptures();
98     int ParseEscape(const uint8_t **pp, int isUtf16);
99     int RecountCaptures();
100     int IsIdentFirst(uint32_t c);
101 
GetGroupNames()102     inline PandaVector<PandaString> GetGroupNames() const
103     {
104         return newGroupNames_;
105     }
106 
GetGroupNamesSize()107     inline size_t GetGroupNamesSize() const
108     {
109         return groupNames_.size_;
110     }
111 
IsError()112     inline bool IsError() const
113     {
114         return isError_;
115     }
116 
GetOriginBuffer()117     inline uint8_t *GetOriginBuffer() const
118     {
119         return buffer_.buf_;
120     }
121 
GetOriginBufferSize()122     inline size_t GetOriginBufferSize() const
123     {
124         return buffer_.size_;
125     }
126 
GetErrorMsg()127     inline PandaString GetErrorMsg() const
128     {
129         if (isError_) {
130             return PandaString(errorMsg_);
131         }
132         return PandaString("");
133     }
134 
IsGlobal()135     inline bool IsGlobal() const
136     {
137         return (flags_ & FLAG_GLOBAL) != 0;
138     }
139 
IsIgnoreCase()140     inline bool IsIgnoreCase() const
141     {
142         return (flags_ & FLAG_IGNORECASE) != 0;
143     }
144 
IsMultiline()145     inline bool IsMultiline() const
146     {
147         return (flags_ & FLAG_MULTILINE) != 0;
148     }
149 
IsDotAll()150     inline bool IsDotAll() const
151     {
152         return (flags_ & FLAG_DOTALL) != 0;
153     }
154 
IsUtf16()155     inline bool IsUtf16() const
156     {
157         return (flags_ & FLAG_UTF16) != 0;
158     }
159 
IsStick()160     inline bool IsStick() const
161     {
162         return (flags_ & FLAG_STICKY) != 0;
163     }
164 
Canonicalize(int c,bool isUnicode)165     inline static int Canonicalize(int c, bool isUnicode)
166     {
167         if (c < TMP_BUF_SIZE) {  // NOLINTNEXTLINE(readability-magic-numbers)
168             if (c >= 'a' && c <= 'z') {
169                 c = c - 'a' + 'A';
170             }
171         } else {
172             if (isUnicode) {
173                 c = u_toupper(static_cast<UChar32>(c));
174             }
175         }
176         return c;
177     }
178 
179 private:
180     friend class RegExpExecutor;
181     static constexpr int TMP_BUF_SIZE = 128;
Clear()182     void Clear()
183     {
184         base_ = nullptr;
185         pc_ = nullptr;
186         end_ = nullptr;
187         c0_ = KEY_EOF;
188         isError_ = false;
189     }
190 
Advance()191     void Advance()
192     {
193         if (pc_ <= end_) {
194             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
195             c0_ = *pc_++;
196         } else {
197             c0_ = KEY_EOF;
198         }
199     }
200 
Advance(int offset)201     void Advance(int offset)
202     {
203         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
204         pc_ += offset - 1;
205         Advance();
206     }
207 
Prev()208     void Prev()
209     {
210         if (pc_ >= base_) {
211             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
212             c0_ = *pc_--;
213         } else {
214             c0_ = KEY_EOF;
215         }
216     }
217 
SetIsError()218     void SetIsError()
219     {
220         isError_ = true;
221     }
222 
223     bool ParseQuantifierPrefix(int &min, int &max, bool &isGreedy);
224     void PrintF(const char *fmt, ...);
225     void ParseUnicodePropertyValueCharacters(int &result);
226     void PrintControlEscapeAndAdvance();
227     void ParseControlLetter(uint32_t &result);
228     void ParseCharacterEscapeDefault(uint32_t &result);
229     void InsertRangeBase(RangeSet *atom, RangeSet &rangeSet, bool invert);
230     void InsertRangeOpCode(DynChunk &buffer, RangeSet &rangeSet, PrevOpCode &prevOp, bool isBackward);
231     void InsertRange32OpCode(DynChunk &buffer, RangeSet &rangeSet, PrevOpCode &prevOp, bool isBackward);
232     void ParseLookBehind(DynChunk &buffer, PrevOpCode &prevOp, bool isBackward);
233     int ParseGroupName();
234 
235     template <typename OpCodeT>
236     void InsertMatchAheadOpCode(bool isBackward);
237 
238     bool ParseAssertion(bool isBackward, bool &isAtom, bool &parseCapture);
239     bool HandleGroupName();
240     bool ParseClassRangesImpl(RangeSet *result);
241     bool CalculateCaptureIndex(const uint8_t *p, int &captureIndex, const char *groupName, PandaString &name);
242     bool ParseCaptureCountImpl(const uint8_t *p, int &captureIndex, const char *groupName, PandaString &name);
243 
244     void ParseAlternativeEscape(bool isBackward, bool &isAtom);
245     void ParseAlternativeEscapeDefault(int atomValue);
246     void ParsePatternCharacter(bool isBackward);
247     void ParseAlternativeAny(bool isBackward);
248     void ParseAlternativeRange(bool isBackward);
249     void ParseAlternativeImpl(bool isBackward, bool &isAtom, int &captureIndex);
250 
251     uint8_t *base_ {nullptr};
252     uint8_t *pc_ {nullptr};
253     uint8_t *end_ {nullptr};
254     uint32_t flags_ {0};
255     uint32_t c0_ {KEY_EOF};
256     int captureCount_ {0};
257     int stackCount_ {0};
258     bool isError_ {false};
259     char errorMsg_[TMP_BUF_SIZE] = {0};  // NOLINT(modernize-avoid-c-arrays)
260     int hasNamedCaptures_ = -1;
261     int totalCaptureCount_ = -1;
262     DynChunk buffer_ {};
263     DynChunk groupNames_ {};
264     PandaVector<PandaString> newGroupNames_ {};
265 };
266 }  // namespace ark
267 #endif  // PANDA_RUNTIME_REGEXP_PARSER_H
268