1 /* 2 * Copyright (c) 2021 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #ifndef ECMASCRIPT_REGEXP_PARSER_H 17 #define ECMASCRIPT_REGEXP_PARSER_H 18 19 #include <cstdarg> 20 #include <cstdio> 21 #include <cstdint> 22 #include "ecmascript/mem/chunk.h" 23 #include "ecmascript/mem/c_containers.h" 24 #include "ecmascript/mem/c_string.h" 25 #include "ecmascript/mem/dyn_chunk.h" 26 #include "ecmascript/regexp/regexp_opcode.h" 27 #include "unicode/stringpiece.h" 28 #include "unicode/uchar.h" 29 #include "unicode/utf16.h" 30 #include "unicode/utf8.h" 31 #include "unicode/utypes.h" 32 #include "unicode/udata.h" 33 34 namespace panda::ecmascript { 35 class RegExpParser { 36 public: 37 static constexpr auto FLAG_GLOBAL = (1U << 0U); 38 static constexpr auto FLAG_IGNORECASE = (1U << 1U); 39 static constexpr auto FLAG_MULTILINE = (1U << 2U); 40 static constexpr auto FLAG_DOTALL = (1U << 3U); 41 static constexpr auto FLAG_UTF16 = (1U << 4U); 42 static constexpr auto FLAG_STICKY = (1U << 5U); 43 static const uint32_t KEY_EOF = UINT32_MAX; 44 static constexpr int CLASS_RANGE_BASE = 0x40000000; 45 static constexpr uint32_t NUM_CAPTURE__OFFSET = 4; 46 static constexpr uint32_t NUM_STACK_OFFSET = 8; 47 static constexpr uint32_t OCTAL_VALUE = 8; 48 static constexpr uint32_t OCTAL_VALUE_RANGE = 32; 49 static constexpr uint32_t HEX_VALUE = 16; 50 static constexpr int32_t DECIMAL_DIGITS_ADVANCE = 10; 51 static constexpr uint32_t FLAGS_OFFSET = 12; 52 static constexpr uint32_t OP_START_OFFSET = 16; 53 static constexpr uint32_t UNICODE_HEX_VALUE = 4; 54 static constexpr uint32_t UNICODE_HEX_ADVANCE = 2; 55 static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3; 56 static constexpr uint32_t UTF8_CHAR_LEN_MAX = 6; 57 RegExpParser(Chunk * chunk)58 explicit RegExpParser(Chunk *chunk) 59 : base_(nullptr), 60 pc_(nullptr), 61 end_(nullptr), 62 flags_(0), 63 c0_(KEY_EOF), 64 captureCount_(0), 65 stackCount_(0), 66 isError_(false), 67 buffer_(chunk), 68 groupNames_(chunk) 69 { 70 } 71 ~RegExpParser()72 ~RegExpParser() 73 { 74 Clear(); 75 } 76 77 NO_COPY_SEMANTIC(RegExpParser); 78 NO_MOVE_SEMANTIC(RegExpParser); 79 Init(char * source,size_t length,uint32_t flags)80 inline void Init(char *source, size_t length, uint32_t flags) 81 { 82 pc_ = reinterpret_cast<uint8_t *>(source); 83 base_ = pc_; 84 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 85 end_ = reinterpret_cast<uint8_t *>(source) + length - 1; 86 flags_ = flags; 87 } 88 89 void Parse(); 90 void ParseDisjunction(bool isBackward); 91 void ParseAlternative(bool isBackward); 92 bool ParseAssertionCapture(int *captureIndex, bool isBackward); 93 void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd); 94 int ParseDecimalDigits(); 95 int ParseAtomEscape(bool isBackward); 96 int ParseCharacterEscape(); 97 bool ParseGroupSpecifier(const uint8_t **pp, CString &name); 98 int ParseCaptureCount(const char *groupName); 99 bool ParseClassRanges(RangeSet *result); 100 void ParseNonemptyClassRangesNoDash(DynChunk *buffer); 101 uint32_t ParseClassAtom(RangeSet *atom); 102 int ParseClassEscape(RangeSet *atom); 103 void ParseError(const char *errorMessage); 104 void ParseUnicodePropertyValueCharacters(bool *isValue); 105 int FindGroupName(const CString &name); 106 uint32_t ParseOctalLiteral(); 107 bool ParseHexEscape(int length, uint32_t *value); 108 bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value); 109 bool ParseUnicodeEscape(uint32_t *value); 110 bool ParserIntervalQuantifier(int *pmin, int *pmax); 111 bool HasNamedCaptures(); 112 int ParseEscape(const uint8_t **pp, int isUtf16); 113 int RecountCaptures(); 114 int IsIdentFirst(uint32_t c); 115 GetGroupNames()116 inline CVector<CString> GetGroupNames() const 117 { 118 return newGroupNames_; 119 } 120 GetGroupNamesSize()121 inline size_t GetGroupNamesSize() const 122 { 123 return groupNames_.size_ ; 124 } 125 IsError()126 inline bool IsError() const 127 { 128 return isError_; 129 } 130 GetOriginBuffer()131 inline uint8_t *GetOriginBuffer() const 132 { 133 return buffer_.buf_; 134 } 135 GetOriginBufferSize()136 inline size_t GetOriginBufferSize() const 137 { 138 return buffer_.size_; 139 } 140 GetErrorMsg()141 inline CString GetErrorMsg() const 142 { 143 if (isError_) { 144 return CString(errorMsg_); 145 } 146 return CString(""); 147 } 148 IsGlobal()149 inline bool IsGlobal() const 150 { 151 return (flags_ & FLAG_GLOBAL) != 0; 152 } 153 IsIgnoreCase()154 inline bool IsIgnoreCase() const 155 { 156 return (flags_ & FLAG_IGNORECASE) != 0; 157 } 158 IsMultiline()159 inline bool IsMultiline() const 160 { 161 return (flags_ & FLAG_MULTILINE) != 0; 162 } 163 IsDotAll()164 inline bool IsDotAll() const 165 { 166 return (flags_ & FLAG_DOTALL) != 0; 167 } 168 IsUtf16()169 inline bool IsUtf16() const 170 { 171 return (flags_ & FLAG_UTF16) != 0; 172 } 173 IsStick()174 inline bool IsStick() const 175 { 176 return (flags_ & FLAG_STICKY) != 0; 177 } 178 Canonicalize(int c,bool isUnicode)179 inline static int Canonicalize(int c, bool isUnicode) 180 { 181 if (c < TMP_BUF_SIZE) { // NOLINTNEXTLINE(readability-magic-numbers) 182 if (c >= 'a' && c <= 'z') { 183 c = c - 'a' + 'A'; 184 } 185 } else { 186 if (isUnicode) { 187 c = u_toupper(static_cast<UChar32>(c)); 188 } 189 } 190 return c; 191 } 192 193 private: 194 friend class RegExpExecutor; 195 static constexpr int TMP_BUF_SIZE = 128; Clear()196 void Clear() 197 { 198 base_ = nullptr; 199 pc_ = nullptr; 200 end_ = nullptr; 201 c0_ = KEY_EOF; 202 isError_ = false; 203 } 204 Advance()205 void Advance() 206 { 207 if (pc_ <= end_) { 208 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 209 c0_ = *pc_++; 210 } else { 211 c0_ = KEY_EOF; 212 } 213 } 214 Advance(int offset)215 void Advance(int offset) 216 { 217 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 218 pc_ += offset - 1; 219 Advance(); 220 } 221 Prev()222 void Prev() 223 { 224 if (pc_ >= base_) { 225 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 226 c0_ = *pc_--; 227 } else { 228 c0_ = KEY_EOF; 229 } 230 } 231 SetIsError()232 void SetIsError() 233 { 234 isError_ = true; 235 } 236 237 void PrintF(const char *fmt, ...); 238 uint8_t *base_; 239 uint8_t *pc_; 240 uint8_t *end_; 241 uint32_t flags_; 242 uint32_t c0_; 243 int captureCount_; 244 int stackCount_; 245 bool isError_; 246 char errorMsg_[TMP_BUF_SIZE] = {0}; // NOLINTNEXTLINE(modernize-avoid-c-arrays) 247 int hasNamedCaptures_ = -1; 248 int totalCaptureCount_ = -1; 249 DynChunk buffer_; 250 DynChunk groupNames_; 251 CVector<CString> newGroupNames_; 252 }; 253 } // namespace panda::ecmascript 254 #endif // ECMASCRIPT_REGEXP_PARSER_H 255