1 /* 2 * Copyright (c) 2021 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #ifndef ECMASCRIPT_REGEXP_PARSER_H 17 #define ECMASCRIPT_REGEXP_PARSER_H 18 19 #include <cstdarg> 20 #include <cstdio> 21 #include <cstdint> 22 #include "ecmascript/js_thread.h" 23 #include "ecmascript/ecma_macros.h" 24 #include "ecmascript/mem/chunk.h" 25 #include "ecmascript/mem/c_containers.h" 26 #include "ecmascript/mem/c_string.h" 27 #include "ecmascript/mem/dyn_chunk.h" 28 #include "ecmascript/regexp/regexp_opcode.h" 29 #include "unicode/stringpiece.h" 30 #include "unicode/uchar.h" 31 #include "unicode/utf16.h" 32 #include "unicode/utf8.h" 33 #include "unicode/utypes.h" 34 #include "unicode/udata.h" 35 #include "unicode/uniset.h" 36 37 namespace panda::ecmascript { 38 class RegExpParser { 39 public: 40 static constexpr auto FLAG_GLOBAL = (1U << 0U); 41 static constexpr auto FLAG_IGNORECASE = (1U << 1U); 42 static constexpr auto FLAG_MULTILINE = (1U << 2U); 43 static constexpr auto FLAG_DOTALL = (1U << 3U); 44 static constexpr auto FLAG_UTF16 = (1U << 4U); 45 static constexpr auto FLAG_STICKY = (1U << 5U); 46 static constexpr auto FLAG_HASINDICES = (1U << 6U); 47 static constexpr uint32_t FLAG_NUM = 7; 48 static const uint32_t KEY_EOF = UINT32_MAX; 49 static constexpr int CLASS_RANGE_BASE = 0x40000000; 50 static constexpr uint32_t NUM_CAPTURE__OFFSET = 4; 51 static constexpr uint32_t NUM_STACK_OFFSET = 8; 52 static constexpr uint32_t OCTAL_VALUE = 8; 53 static constexpr uint32_t OCTAL_VALUE_RANGE = 32; 54 static constexpr uint32_t HEX_VALUE = 16; 55 static constexpr uint32_t DECIMAL_DIGITS_ADVANCE = 10; 56 static constexpr uint32_t FLAGS_OFFSET = 12; 57 static constexpr uint32_t PREFILTER_OFFSET = 16; 58 static constexpr uint32_t OP_START_OFFSET = 20; 59 static constexpr uint32_t UNICODE_HEX_VALUE = 4; 60 static constexpr uint32_t UNICODE_HEX_ADVANCE = 2; 61 static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3; 62 static constexpr uint32_t UTF8_CHAR_LEN_MAX = 6; 63 static constexpr size_t SPARSE_HEAD_OFFSET = 3; 64 static constexpr size_t SPARSE_OFF_OFFSET = 2; 65 static constexpr size_t SPARSE_MAX_OFFSET = 6; 66 static int Canonicalize(int c, bool isUnicode); 67 RegExpParser(JSThread * thread,Chunk * chunk)68 explicit RegExpParser(JSThread *thread, Chunk *chunk) 69 : thread_(thread), 70 base_(nullptr), 71 pc_(nullptr), 72 end_(nullptr), 73 flags_(0), 74 c0_(KEY_EOF), 75 captureCount_(0), 76 stackCount_(0), 77 isError_(false), 78 isEmpty_(false), 79 buffer_(chunk), 80 groupNames_(chunk) 81 { 82 } 83 ~RegExpParser()84 ~RegExpParser() 85 { 86 Clear(); 87 } 88 89 NO_COPY_SEMANTIC(RegExpParser); 90 NO_MOVE_SEMANTIC(RegExpParser); 91 Init(char * source,size_t length,uint32_t flags)92 inline void Init(char *source, size_t length, uint32_t flags) 93 { 94 pc_ = reinterpret_cast<uint8_t *>(source); 95 base_ = pc_; 96 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 97 end_ = reinterpret_cast<uint8_t *>(source) + length - 1; 98 flags_ = flags; 99 } 100 101 void Parse(); 102 void ParseDisjunction(bool isBackward); 103 void ParseAlternative(bool isBackward); 104 bool ParseAssertionCapture(int *captureIndex, bool isBackward); 105 void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd); 106 int ParseDecimalDigits(); 107 int ParseAtomEscape(bool isBackward); 108 int ParseCharacterEscape(); 109 bool ParseGroupSpecifier(const uint8_t **pp, CString &name); 110 int ParseCaptureCount(const char *groupName); 111 bool ParseClassRanges(RangeSet *result); 112 void ParseNonemptyClassRangesNoDash(DynChunk *buffer); 113 uint32_t ParseClassAtom(RangeSet *atom); 114 int ParseClassEscape(RangeSet *atom); 115 void ParseError(const char *errorMessage); 116 bool ParseUnicodePropertyValueCharacters(CString &categoryName, CString &valueName); 117 int FindGroupName(const CString &name); 118 uint32_t ParseOctalLiteral(); 119 bool ParseHexEscape(int length, uint32_t *value); 120 bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value); 121 bool ParseUnicodeEscape(uint32_t *value); 122 bool ParserIntervalQuantifier(int *pmin, int *pmax); 123 bool HasNamedCaptures(); 124 int ParseEscape(const uint8_t **pp, int isUtf16); 125 int RecountCaptures(); 126 int IsIdentFirst(uint32_t c); 127 bool NeedIntersection(uint32_t c); 128 void DoParserStackOverflowCheck(const char *errorMessage); 129 bool MatchUnicodeProperty(UProperty property, const char *propertyName, RangeSet *atom, bool negate); 130 bool IsExactPropertyValueAlis(const char *valueName, UProperty property, int32_t propertyValue); 131 bool ParseUnicodePropertyClassRange(CString &propertyName, CString &valueName, RangeSet *atom, bool negate); 132 bool GetUnicodePropertyName(CString &propertyName); 133 bool GetUnicodePropertyValueName(CString &valueName); 134 bool IsExactPropertyAlias(const char *propertyName, UProperty property); 135 bool MatchSepcialUnicodeProperty(CString &name, bool negate, RangeSet *atom); 136 bool IsSupportedBinaryProperty(UProperty property); 137 bool IsBinaryPropertyOfStrings(UProperty property); GetGroupNames()138 inline CVector<CString> GetGroupNames() const 139 { 140 return newGroupNames_; 141 } 142 GetGroupNamesSize()143 inline size_t GetGroupNamesSize() const 144 { 145 return groupNames_.size_; 146 } 147 IsError()148 inline bool IsError() const 149 { 150 return isError_; 151 } 152 GetOriginBuffer()153 inline uint8_t *GetOriginBuffer() const 154 { 155 return buffer_.buf_; 156 } 157 GetOriginBufferSize()158 inline size_t GetOriginBufferSize() const 159 { 160 return buffer_.size_; 161 } 162 GetErrorMsg()163 inline CString GetErrorMsg() const 164 { 165 if (isError_) { 166 return CString(errorMsg_); 167 } 168 return CString(""); 169 } 170 IsGlobal()171 inline bool IsGlobal() const 172 { 173 return (flags_ & FLAG_GLOBAL) != 0; 174 } 175 IsIgnoreCase()176 inline bool IsIgnoreCase() const 177 { 178 return (flags_ & FLAG_IGNORECASE) != 0; 179 } 180 IsMultiline()181 inline bool IsMultiline() const 182 { 183 return (flags_ & FLAG_MULTILINE) != 0; 184 } 185 IsDotAll()186 inline bool IsDotAll() const 187 { 188 return (flags_ & FLAG_DOTALL) != 0; 189 } 190 IsUtf16()191 inline bool IsUtf16() const 192 { 193 return (flags_ & FLAG_UTF16) != 0; 194 } 195 IsStick()196 inline bool IsStick() const 197 { 198 return (flags_ & FLAG_STICKY) != 0; 199 } 200 IsUnicodePropertyValueCharacter(char c)201 inline bool IsUnicodePropertyValueCharacter(char c) const 202 { 203 if (c >= 'a' && c <= 'z') { 204 return true; 205 } 206 if (c >= 'A' && c <= 'Z') { 207 return true; 208 } 209 if (c >= '0' && c <= '9') { 210 return true; 211 } 212 return (c == '_'); 213 } 214 GetcurrentCharNext(int c)215 inline static int GetcurrentCharNext(int c) 216 { 217 int cur = c; 218 c = u_tolower(static_cast<UChar32>(c)); 219 if (c == cur) { 220 c = u_toupper(static_cast<UChar32>(c)); 221 } 222 if (((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) && 223 !((cur >= 'A' && cur <= 'Z') || (cur >= 'a' && cur <= 'z'))) { 224 c = cur; 225 } 226 return c; 227 } ProcessIntersection(RangeSet * result)228 inline static void ProcessIntersection(RangeSet *result) 229 { 230 RangeSet cr; 231 RangeSet cr1; 232 const uint32_t MINLOWERCHAR = 'a'; 233 const uint32_t MAXLOWERCHAR = 'z' + 1; 234 const uint32_t MINUPPERCHAR = 'A'; 235 const uint32_t MAXUPPERCHAR = 'Z' + 1; 236 // Range values for a and z + 1 237 cr.Insert(MINLOWERCHAR, MAXLOWERCHAR); 238 // Range values for A and Z + 1 239 cr.Insert(MINUPPERCHAR, MAXUPPERCHAR); 240 result->Inter(cr1, cr); 241 result->Insert(cr1); 242 } 243 private: 244 friend class RegExpExecutor; 245 static constexpr int TMP_BUF_SIZE = 128; Clear()246 void Clear() 247 { 248 base_ = nullptr; 249 pc_ = nullptr; 250 end_ = nullptr; 251 c0_ = KEY_EOF; 252 isError_ = false; 253 isEmpty_ = false; 254 } 255 Advance()256 void Advance() 257 { 258 if (pc_ <= end_) { 259 DoParserStackOverflowCheck("Advance stack overflow!"); 260 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 261 c0_ = *pc_++; 262 } else { 263 c0_ = KEY_EOF; 264 } 265 } 266 Advance(int offset)267 void Advance(int offset) 268 { 269 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 270 pc_ += offset - 1; 271 Advance(); 272 } 273 Prev()274 void Prev() 275 { 276 if (pc_ >= base_) { 277 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 278 c0_ = *pc_--; 279 } else { 280 c0_ = KEY_EOF; 281 } 282 } 283 SetIsError()284 void SetIsError() 285 { 286 isError_ = true; 287 } 288 289 void PrintF(const char *fmt, ...); 290 JSThread *thread_; 291 uint8_t *base_; 292 uint8_t *pc_; 293 uint8_t *end_; 294 uint32_t flags_; 295 uint32_t c0_; 296 int captureCount_; 297 int stackCount_; 298 bool isError_; 299 bool isEmpty_; 300 char errorMsg_[TMP_BUF_SIZE] = {0}; // NOLINTNEXTLINE(modernize-avoid-c-arrays) 301 int hasNamedCaptures_ = -1; 302 int totalCaptureCount_ = -1; 303 DynChunk buffer_; 304 DynChunk groupNames_; 305 CVector<CString> newGroupNames_; 306 }; 307 } // namespace panda::ecmascript 308 #endif // ECMASCRIPT_REGEXP_PARSER_H 309