1 /** 2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #ifndef PANDA_RUNTIME_REGEXP_PARSER_H 17 #define PANDA_RUNTIME_REGEXP_PARSER_H 18 19 #include <cstdarg> 20 #include <cstdio> 21 #include <cstdint> 22 #include "runtime/regexp/ecmascript/mem/dyn_chunk.h" 23 #include "runtime/regexp/ecmascript/regexp_opcode.h" 24 #include "unicode/stringpiece.h" 25 #include "unicode/uchar.h" 26 #include "unicode/utf16.h" 27 #include "unicode/utf8.h" 28 #include "unicode/utypes.h" 29 #include "unicode/udata.h" 30 31 namespace ark { 32 class RegExpParser { 33 public: 34 static constexpr auto FLAG_GLOBAL = (1U << 0U); 35 static constexpr auto FLAG_IGNORECASE = (1U << 1U); 36 static constexpr auto FLAG_MULTILINE = (1U << 2U); 37 static constexpr auto FLAG_DOTALL = (1U << 3U); 38 static constexpr auto FLAG_UTF16 = (1U << 4U); 39 static constexpr auto FLAG_STICKY = (1U << 5U); 40 static constexpr auto FLAG_HASINDICES = (1U << 6U); 41 static const uint32_t KEY_EOF = UINT32_MAX; 42 static constexpr int CLASS_RANGE_BASE = 0x40000000; 43 static constexpr uint32_t NUM_CAPTURE__OFFSET = 4; 44 static constexpr uint32_t NUM_STACK_OFFSET = 8; 45 static constexpr uint32_t OCTAL_VALUE = 8; 46 static constexpr uint32_t OCTAL_VALUE_RANGE = 32; 47 static constexpr uint32_t HEX_VALUE = 16; 48 static constexpr int32_t DECIMAL_DIGITS_ADVANCE = 10; 49 static constexpr uint32_t FLAGS_OFFSET = 12; 50 static constexpr uint32_t OP_START_OFFSET = 16; 51 static constexpr uint32_t UNICODE_HEX_VALUE = 4; 52 static constexpr uint32_t UNICODE_HEX_ADVANCE = 2; 53 static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3; 54 static constexpr uint32_t UTF8_CHAR_LEN_MAX = 6; 55 56 explicit RegExpParser() = default; 57 ~RegExpParser()58 ~RegExpParser() 59 { 60 Clear(); 61 } 62 63 NO_COPY_SEMANTIC(RegExpParser); 64 NO_MOVE_SEMANTIC(RegExpParser); 65 Init(char * source,size_t length,uint32_t flags)66 inline void Init(char *source, size_t length, uint32_t flags) 67 { 68 pc_ = reinterpret_cast<uint8_t *>(source); 69 base_ = pc_; 70 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 71 end_ = reinterpret_cast<uint8_t *>(source) + length - 1; 72 flags_ = flags; 73 } 74 75 PANDA_PUBLIC_API void Parse(); 76 void ParseDisjunction(bool isBackward); 77 void ParseAlternative(bool isBackward); 78 bool ParseAssertionCapture(int *captureIndex, bool isBackward); 79 void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd); 80 int ParseDecimalDigits(); 81 int ParseAtomEscape(bool isBackward); 82 int ParseCharacterEscape(); 83 bool ParseGroupSpecifier(const uint8_t **pp, PandaString &name); 84 int ParseCaptureCount(const char *groupName); 85 bool ParseClassRanges(RangeSet *result); 86 void ParseNonemptyClassRangesNoDash(DynChunk *buffer); 87 uint32_t ParseClassAtom(RangeSet *atom); 88 int ParseClassEscape(RangeSet *atom); 89 void ParseError(const char *errorMessage); 90 void ParseUnicodePropertyValueCharactersImpl(bool *isValue); 91 int FindGroupName(const PandaString &name); 92 uint32_t ParseOctalLiteral(); 93 bool ParseHexEscape(int length, uint32_t *value); 94 bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value); 95 bool ParseUnicodeEscape(uint32_t *value); 96 bool ParserIntervalQuantifier(int *pmin, int *pmax); 97 bool HasNamedCaptures(); 98 int ParseEscape(const uint8_t **pp, int isUtf16); 99 int RecountCaptures(); 100 int IsIdentFirst(uint32_t c); 101 GetGroupNames()102 inline PandaVector<PandaString> GetGroupNames() const 103 { 104 return newGroupNames_; 105 } 106 GetGroupNamesSize()107 inline size_t GetGroupNamesSize() const 108 { 109 return groupNames_.size_; 110 } 111 IsError()112 inline bool IsError() const 113 { 114 return isError_; 115 } 116 GetOriginBuffer()117 inline uint8_t *GetOriginBuffer() const 118 { 119 return buffer_.buf_; 120 } 121 GetOriginBufferSize()122 inline size_t GetOriginBufferSize() const 123 { 124 return buffer_.size_; 125 } 126 GetErrorMsg()127 inline PandaString GetErrorMsg() const 128 { 129 if (isError_) { 130 return PandaString(errorMsg_); 131 } 132 return PandaString(""); 133 } 134 IsGlobal()135 inline bool IsGlobal() const 136 { 137 return (flags_ & FLAG_GLOBAL) != 0; 138 } 139 IsIgnoreCase()140 inline bool IsIgnoreCase() const 141 { 142 return (flags_ & FLAG_IGNORECASE) != 0; 143 } 144 IsMultiline()145 inline bool IsMultiline() const 146 { 147 return (flags_ & FLAG_MULTILINE) != 0; 148 } 149 IsDotAll()150 inline bool IsDotAll() const 151 { 152 return (flags_ & FLAG_DOTALL) != 0; 153 } 154 IsUtf16()155 inline bool IsUtf16() const 156 { 157 return (flags_ & FLAG_UTF16) != 0; 158 } 159 IsStick()160 inline bool IsStick() const 161 { 162 return (flags_ & FLAG_STICKY) != 0; 163 } 164 Canonicalize(int c,bool isUnicode)165 inline static int Canonicalize(int c, bool isUnicode) 166 { 167 if (c < TMP_BUF_SIZE) { // NOLINTNEXTLINE(readability-magic-numbers) 168 if (c >= 'a' && c <= 'z') { 169 c = c - 'a' + 'A'; 170 } 171 } else { 172 if (isUnicode) { 173 c = u_toupper(static_cast<UChar32>(c)); 174 } 175 } 176 return c; 177 } 178 179 private: 180 friend class RegExpExecutor; 181 static constexpr int TMP_BUF_SIZE = 128; Clear()182 void Clear() 183 { 184 base_ = nullptr; 185 pc_ = nullptr; 186 end_ = nullptr; 187 c0_ = KEY_EOF; 188 isError_ = false; 189 } 190 Advance()191 void Advance() 192 { 193 if (pc_ <= end_) { 194 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 195 c0_ = *pc_++; 196 } else { 197 c0_ = KEY_EOF; 198 } 199 } 200 Advance(int offset)201 void Advance(int offset) 202 { 203 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 204 pc_ += offset - 1; 205 Advance(); 206 } 207 Prev()208 void Prev() 209 { 210 if (pc_ >= base_) { 211 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 212 c0_ = *pc_--; 213 } else { 214 c0_ = KEY_EOF; 215 } 216 } 217 SetIsError()218 void SetIsError() 219 { 220 isError_ = true; 221 } 222 223 bool ParseQuantifierPrefix(int &min, int &max, bool &isGreedy); 224 void PrintF(const char *fmt, ...); 225 void ParseUnicodePropertyValueCharacters(int &result); 226 void PrintControlEscapeAndAdvance(); 227 void ParseControlLetter(uint32_t &result); 228 void ParseCharacterEscapeDefault(uint32_t &result); 229 void InsertRangeBase(RangeSet *atom, RangeSet &rangeSet, bool invert); 230 void InsertRangeOpCode(DynChunk &buffer, RangeSet &rangeSet, PrevOpCode &prevOp, bool isBackward); 231 void InsertRange32OpCode(DynChunk &buffer, RangeSet &rangeSet, PrevOpCode &prevOp, bool isBackward); 232 void ParseLookBehind(DynChunk &buffer, PrevOpCode &prevOp, bool isBackward); 233 int ParseGroupName(); 234 235 template <typename OpCodeT> 236 void InsertMatchAheadOpCode(bool isBackward); 237 238 bool ParseAssertion(bool isBackward, bool &isAtom, bool &parseCapture); 239 bool HandleGroupName(); 240 bool ParseClassRangesImpl(RangeSet *result); 241 bool CalculateCaptureIndex(const uint8_t *p, int &captureIndex, const char *groupName, PandaString &name); 242 bool ParseCaptureCountImpl(const uint8_t *p, int &captureIndex, const char *groupName, PandaString &name); 243 244 void ParseAlternativeEscape(bool isBackward, bool &isAtom); 245 void ParseAlternativeEscapeDefault(int atomValue); 246 void ParsePatternCharacter(bool isBackward); 247 void ParseAlternativeAny(bool isBackward); 248 void ParseAlternativeRange(bool isBackward); 249 void ParseAlternativeImpl(bool isBackward, bool &isAtom, int &captureIndex); 250 251 uint8_t *base_ {nullptr}; 252 uint8_t *pc_ {nullptr}; 253 uint8_t *end_ {nullptr}; 254 uint32_t flags_ {0}; 255 uint32_t c0_ {KEY_EOF}; 256 int captureCount_ {0}; 257 int stackCount_ {0}; 258 bool isError_ {false}; 259 char errorMsg_[TMP_BUF_SIZE] = {0}; // NOLINT(modernize-avoid-c-arrays) 260 int hasNamedCaptures_ = -1; 261 int totalCaptureCount_ = -1; 262 DynChunk buffer_ {}; 263 DynChunk groupNames_ {}; 264 PandaVector<PandaString> newGroupNames_ {}; 265 }; 266 } // namespace ark 267 #endif // CORE_REGEXP_PARSER_H 268