1 /* 2 * Copyright (c) 2021 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #ifndef ECMASCRIPT_REGEXP_PARSER_H 17 #define ECMASCRIPT_REGEXP_PARSER_H 18 19 #include <cstdarg> 20 #include <cstdio> 21 #include <cstdint> 22 #include "ecmascript/js_thread.h" 23 #include "ecmascript/ecma_macros.h" 24 #include "ecmascript/mem/chunk.h" 25 #include "ecmascript/mem/c_containers.h" 26 #include "ecmascript/mem/c_string.h" 27 #include "ecmascript/mem/dyn_chunk.h" 28 #include "ecmascript/regexp/regexp_opcode.h" 29 #include "unicode/stringpiece.h" 30 #include "unicode/uchar.h" 31 #include "unicode/utf16.h" 32 #include "unicode/utf8.h" 33 #include "unicode/utypes.h" 34 #include "unicode/udata.h" 35 36 namespace panda::ecmascript { 37 class RegExpParser { 38 public: 39 static constexpr auto FLAG_GLOBAL = (1U << 0U); 40 static constexpr auto FLAG_IGNORECASE = (1U << 1U); 41 static constexpr auto FLAG_MULTILINE = (1U << 2U); 42 static constexpr auto FLAG_DOTALL = (1U << 3U); 43 static constexpr auto FLAG_UTF16 = (1U << 4U); 44 static constexpr auto FLAG_STICKY = (1U << 5U); 45 static constexpr auto FLAG_HASINDICES = (1U << 6U); 46 static constexpr uint32_t FLAG_NUM = 7; 47 static const uint32_t KEY_EOF = UINT32_MAX; 48 static constexpr int CLASS_RANGE_BASE = 0x40000000; 49 static constexpr uint32_t NUM_CAPTURE__OFFSET = 4; 50 static constexpr uint32_t NUM_STACK_OFFSET = 8; 51 static constexpr uint32_t OCTAL_VALUE = 8; 52 static constexpr uint32_t OCTAL_VALUE_RANGE = 32; 53 static constexpr uint32_t HEX_VALUE = 16; 54 static constexpr uint32_t DECIMAL_DIGITS_ADVANCE = 10; 55 static constexpr uint32_t FLAGS_OFFSET = 12; 56 static constexpr uint32_t OP_START_OFFSET = 16; 57 static constexpr uint32_t UNICODE_HEX_VALUE = 4; 58 static constexpr uint32_t UNICODE_HEX_ADVANCE = 2; 59 static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3; 60 static constexpr uint32_t UTF8_CHAR_LEN_MAX = 6; 61 static int Canonicalize(int c, bool isUnicode); 62 RegExpParser(JSThread * thread,Chunk * chunk)63 explicit RegExpParser(JSThread *thread, Chunk *chunk) 64 : thread_(thread), 65 base_(nullptr), 66 pc_(nullptr), 67 end_(nullptr), 68 flags_(0), 69 c0_(KEY_EOF), 70 captureCount_(0), 71 stackCount_(0), 72 isError_(false), 73 buffer_(chunk), 74 groupNames_(chunk) 75 { 76 } 77 ~RegExpParser()78 ~RegExpParser() 79 { 80 Clear(); 81 } 82 83 NO_COPY_SEMANTIC(RegExpParser); 84 NO_MOVE_SEMANTIC(RegExpParser); 85 Init(char * source,size_t length,uint32_t flags)86 inline void Init(char *source, size_t length, uint32_t flags) 87 { 88 pc_ = reinterpret_cast<uint8_t *>(source); 89 base_ = pc_; 90 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 91 end_ = reinterpret_cast<uint8_t *>(source) + length - 1; 92 flags_ = flags; 93 } 94 95 void Parse(); 96 void ParseDisjunction(bool isBackward); 97 void ParseAlternative(bool isBackward); 98 bool ParseAssertionCapture(int *captureIndex, bool isBackward); 99 void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd); 100 int ParseDecimalDigits(); 101 int ParseAtomEscape(bool isBackward); 102 int ParseCharacterEscape(); 103 bool ParseGroupSpecifier(const uint8_t **pp, CString &name); 104 int ParseCaptureCount(const char *groupName); 105 bool ParseClassRanges(RangeSet *result); 106 void ParseNonemptyClassRangesNoDash(DynChunk *buffer); 107 uint32_t ParseClassAtom(RangeSet *atom); 108 int ParseClassEscape(RangeSet *atom); 109 void ParseError(const char *errorMessage); 110 void ParseUnicodePropertyValueCharacters(bool *isValue); 111 int FindGroupName(const CString &name); 112 uint32_t ParseOctalLiteral(); 113 bool ParseHexEscape(int length, uint32_t *value); 114 bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value); 115 bool ParseUnicodeEscape(uint32_t *value); 116 bool ParserIntervalQuantifier(int *pmin, int *pmax); 117 bool HasNamedCaptures(); 118 int ParseEscape(const uint8_t **pp, int isUtf16); 119 int RecountCaptures(); 120 int IsIdentFirst(uint32_t c); 121 bool NeedIntersection(uint32_t c); 122 void DoParserStackOverflowCheck(const char *errorMessage); 123 GetGroupNames()124 inline CVector<CString> GetGroupNames() const 125 { 126 return newGroupNames_; 127 } 128 GetGroupNamesSize()129 inline size_t GetGroupNamesSize() const 130 { 131 return groupNames_.size_; 132 } 133 IsError()134 inline bool IsError() const 135 { 136 return isError_; 137 } 138 GetOriginBuffer()139 inline uint8_t *GetOriginBuffer() const 140 { 141 return buffer_.buf_; 142 } 143 GetOriginBufferSize()144 inline size_t GetOriginBufferSize() const 145 { 146 return buffer_.size_; 147 } 148 GetErrorMsg()149 inline CString GetErrorMsg() const 150 { 151 if (isError_) { 152 return CString(errorMsg_); 153 } 154 return CString(""); 155 } 156 IsGlobal()157 inline bool IsGlobal() const 158 { 159 return (flags_ & FLAG_GLOBAL) != 0; 160 } 161 IsIgnoreCase()162 inline bool IsIgnoreCase() const 163 { 164 return (flags_ & FLAG_IGNORECASE) != 0; 165 } 166 IsMultiline()167 inline bool IsMultiline() const 168 { 169 return (flags_ & FLAG_MULTILINE) != 0; 170 } 171 IsDotAll()172 inline bool IsDotAll() const 173 { 174 return (flags_ & FLAG_DOTALL) != 0; 175 } 176 IsUtf16()177 inline bool IsUtf16() const 178 { 179 return (flags_ & FLAG_UTF16) != 0; 180 } 181 IsStick()182 inline bool IsStick() const 183 { 184 return (flags_ & FLAG_STICKY) != 0; 185 } 186 GetcurrentCharNext(int c)187 inline static int GetcurrentCharNext(int c) 188 { 189 int cur = c; 190 c = u_tolower(static_cast<UChar32>(c)); 191 if (c == cur) { 192 c = u_toupper(static_cast<UChar32>(c)); 193 } 194 if (((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) && 195 !((cur >= 'A' && cur <= 'Z') || (cur >= 'a' && cur <= 'z'))) { 196 c = cur; 197 } 198 return c; 199 } ProcessIntersection(RangeSet * result)200 inline static void ProcessIntersection(RangeSet *result) 201 { 202 RangeSet cr; 203 RangeSet cr1; 204 const uint32_t MINLOWERCHAR = 'a'; 205 const uint32_t MAXLOWERCHAR = 'z' + 1; 206 const uint32_t MINUPPERCHAR = 'A'; 207 const uint32_t MAXUPPERCHAR = 'Z' + 1; 208 // Range values for a and z + 1 209 cr.Insert(MINLOWERCHAR, MAXLOWERCHAR); 210 // Range values for A and Z + 1 211 cr.Insert(MINUPPERCHAR, MAXUPPERCHAR); 212 result->Inter(cr1, cr); 213 result->Insert(cr1); 214 } 215 private: 216 friend class RegExpExecutor; 217 static constexpr int TMP_BUF_SIZE = 128; Clear()218 void Clear() 219 { 220 base_ = nullptr; 221 pc_ = nullptr; 222 end_ = nullptr; 223 c0_ = KEY_EOF; 224 isError_ = false; 225 } 226 Advance()227 void Advance() 228 { 229 if (pc_ <= end_) { 230 DoParserStackOverflowCheck("Advance stack overflow!"); 231 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 232 c0_ = *pc_++; 233 } else { 234 c0_ = KEY_EOF; 235 } 236 } 237 Advance(int offset)238 void Advance(int offset) 239 { 240 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 241 pc_ += offset - 1; 242 Advance(); 243 } 244 Prev()245 void Prev() 246 { 247 if (pc_ >= base_) { 248 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 249 c0_ = *pc_--; 250 } else { 251 c0_ = KEY_EOF; 252 } 253 } 254 SetIsError()255 void SetIsError() 256 { 257 isError_ = true; 258 } 259 260 void PrintF(const char *fmt, ...); 261 JSThread *thread_; 262 uint8_t *base_; 263 uint8_t *pc_; 264 uint8_t *end_; 265 uint32_t flags_; 266 uint32_t c0_; 267 int captureCount_; 268 int stackCount_; 269 bool isError_; 270 char errorMsg_[TMP_BUF_SIZE] = {0}; // NOLINTNEXTLINE(modernize-avoid-c-arrays) 271 int hasNamedCaptures_ = -1; 272 int totalCaptureCount_ = -1; 273 DynChunk buffer_; 274 DynChunk groupNames_; 275 CVector<CString> newGroupNames_; 276 }; 277 } // namespace panda::ecmascript 278 #endif // ECMASCRIPT_REGEXP_PARSER_H 279