1 /* 2 * Copyright (c) 2021 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #ifndef ECMASCRIPT_REGEXP_PARSER_H 17 #define ECMASCRIPT_REGEXP_PARSER_H 18 19 #include <cstdarg> 20 #include <cstdio> 21 #include <cstdint> 22 #include "ecmascript/mem/chunk.h" 23 #include "ecmascript/mem/c_containers.h" 24 #include "ecmascript/mem/c_string.h" 25 #include "ecmascript/mem/dyn_chunk.h" 26 #include "ecmascript/regexp/regexp_opcode.h" 27 #include "unicode/stringpiece.h" 28 #include "unicode/uchar.h" 29 #include "unicode/utf16.h" 30 #include "unicode/utf8.h" 31 #include "unicode/utypes.h" 32 #include "unicode/udata.h" 33 34 namespace panda::ecmascript { 35 class RegExpParser { 36 public: 37 static constexpr auto FLAG_GLOBAL = (1U << 0U); 38 static constexpr auto FLAG_IGNORECASE = (1U << 1U); 39 static constexpr auto FLAG_MULTILINE = (1U << 2U); 40 static constexpr auto FLAG_DOTALL = (1U << 3U); 41 static constexpr auto FLAG_UTF16 = (1U << 4U); 42 static constexpr auto FLAG_STICKY = (1U << 5U); 43 static const uint32_t KEY_EOF = UINT32_MAX; 44 static constexpr int CLASS_RANGE_BASE = 0x40000000; 45 static constexpr uint32_t NUM_CAPTURE__OFFSET = 4; 46 static constexpr uint32_t NUM_STACK_OFFSET = 8; 47 static constexpr uint32_t OCTAL_VALUE = 8; 48 static constexpr uint32_t OCTAL_VALUE_RANGE = 32; 49 static constexpr uint32_t HEX_VALUE = 16; 50 static constexpr uint32_t DECIMAL_DIGITS_ADVANCE = 10; 51 static constexpr uint32_t FLAGS_OFFSET = 12; 52 static constexpr uint32_t OP_START_OFFSET = 16; 53 static constexpr uint32_t UNICODE_HEX_VALUE = 4; 54 static constexpr uint32_t UNICODE_HEX_ADVANCE = 2; 55 static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3; 56 static constexpr uint32_t UTF8_CHAR_LEN_MAX = 6; 57 static int Canonicalize(int c, bool isUnicode); 58 RegExpParser(Chunk * chunk)59 explicit RegExpParser(Chunk *chunk) 60 : base_(nullptr), 61 pc_(nullptr), 62 end_(nullptr), 63 flags_(0), 64 c0_(KEY_EOF), 65 captureCount_(0), 66 stackCount_(0), 67 isError_(false), 68 buffer_(chunk), 69 groupNames_(chunk) 70 { 71 } 72 ~RegExpParser()73 ~RegExpParser() 74 { 75 Clear(); 76 } 77 78 NO_COPY_SEMANTIC(RegExpParser); 79 NO_MOVE_SEMANTIC(RegExpParser); 80 Init(char * source,size_t length,uint32_t flags)81 inline void Init(char *source, size_t length, uint32_t flags) 82 { 83 pc_ = reinterpret_cast<uint8_t *>(source); 84 base_ = pc_; 85 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 86 end_ = reinterpret_cast<uint8_t *>(source) + length - 1; 87 flags_ = flags; 88 } 89 90 void Parse(); 91 void ParseDisjunction(bool isBackward); 92 void ParseAlternative(bool isBackward); 93 bool ParseAssertionCapture(int *captureIndex, bool isBackward); 94 void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd); 95 int ParseDecimalDigits(); 96 int ParseAtomEscape(bool isBackward); 97 int ParseCharacterEscape(); 98 bool ParseGroupSpecifier(const uint8_t **pp, CString &name); 99 int ParseCaptureCount(const char *groupName); 100 bool ParseClassRanges(RangeSet *result); 101 void ParseNonemptyClassRangesNoDash(DynChunk *buffer); 102 uint32_t ParseClassAtom(RangeSet *atom); 103 int ParseClassEscape(RangeSet *atom); 104 void ParseError(const char *errorMessage); 105 void ParseUnicodePropertyValueCharacters(bool *isValue); 106 int FindGroupName(const CString &name); 107 uint32_t ParseOctalLiteral(); 108 bool ParseHexEscape(int length, uint32_t *value); 109 bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value); 110 bool ParseUnicodeEscape(uint32_t *value); 111 bool ParserIntervalQuantifier(int *pmin, int *pmax); 112 bool HasNamedCaptures(); 113 int ParseEscape(const uint8_t **pp, int isUtf16); 114 int RecountCaptures(); 115 int IsIdentFirst(uint32_t c); 116 bool NeedIntersection(uint32_t c); 117 GetGroupNames()118 inline CVector<CString> GetGroupNames() const 119 { 120 return newGroupNames_; 121 } 122 GetGroupNamesSize()123 inline size_t GetGroupNamesSize() const 124 { 125 return groupNames_.size_; 126 } 127 IsError()128 inline bool IsError() const 129 { 130 return isError_; 131 } 132 GetOriginBuffer()133 inline uint8_t *GetOriginBuffer() const 134 { 135 return buffer_.buf_; 136 } 137 GetOriginBufferSize()138 inline size_t GetOriginBufferSize() const 139 { 140 return buffer_.size_; 141 } 142 GetErrorMsg()143 inline CString GetErrorMsg() const 144 { 145 if (isError_) { 146 return CString(errorMsg_); 147 } 148 return CString(""); 149 } 150 IsGlobal()151 inline bool IsGlobal() const 152 { 153 return (flags_ & FLAG_GLOBAL) != 0; 154 } 155 IsIgnoreCase()156 inline bool IsIgnoreCase() const 157 { 158 return (flags_ & FLAG_IGNORECASE) != 0; 159 } 160 IsMultiline()161 inline bool IsMultiline() const 162 { 163 return (flags_ & FLAG_MULTILINE) != 0; 164 } 165 IsDotAll()166 inline bool IsDotAll() const 167 { 168 return (flags_ & FLAG_DOTALL) != 0; 169 } 170 IsUtf16()171 inline bool IsUtf16() const 172 { 173 return (flags_ & FLAG_UTF16) != 0; 174 } 175 IsStick()176 inline bool IsStick() const 177 { 178 return (flags_ & FLAG_STICKY) != 0; 179 } 180 GetcurrentCharNext(int c)181 inline static int GetcurrentCharNext(int c) 182 { 183 int cur = c; 184 c = u_tolower(static_cast<UChar32>(c)); 185 if (c == cur) { 186 c = u_toupper(static_cast<UChar32>(c)); 187 } 188 if (((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) && 189 !((cur >= 'A' && cur <= 'Z') || (cur >= 'a' && cur <= 'z'))) { 190 c = cur; 191 } 192 return c; 193 } ProcessIntersection(RangeSet * result)194 inline static void ProcessIntersection(RangeSet *result) 195 { 196 RangeSet cr; 197 RangeSet cr1; 198 const uint32_t MINLOWERCHAR = 'a'; 199 const uint32_t MAXLOWERCHAR = 'z' + 1; 200 const uint32_t MINUPPERCHAR = 'A'; 201 const uint32_t MAXUPPERCHAR = 'Z' + 1; 202 // Range values for a and z + 1 203 cr.Insert(MINLOWERCHAR, MAXLOWERCHAR); 204 // Range values for A and Z + 1 205 cr.Insert(MINUPPERCHAR, MAXUPPERCHAR); 206 result->Inter(cr1, cr); 207 result->Insert(cr1); 208 } 209 private: 210 friend class RegExpExecutor; 211 static constexpr int TMP_BUF_SIZE = 128; Clear()212 void Clear() 213 { 214 base_ = nullptr; 215 pc_ = nullptr; 216 end_ = nullptr; 217 c0_ = KEY_EOF; 218 isError_ = false; 219 } 220 Advance()221 void Advance() 222 { 223 if (pc_ <= end_) { 224 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 225 c0_ = *pc_++; 226 } else { 227 c0_ = KEY_EOF; 228 } 229 } 230 Advance(int offset)231 void Advance(int offset) 232 { 233 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 234 pc_ += offset - 1; 235 Advance(); 236 } 237 Prev()238 void Prev() 239 { 240 if (pc_ >= base_) { 241 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 242 c0_ = *pc_--; 243 } else { 244 c0_ = KEY_EOF; 245 } 246 } 247 SetIsError()248 void SetIsError() 249 { 250 isError_ = true; 251 } 252 253 void PrintF(const char *fmt, ...); 254 uint8_t *base_; 255 uint8_t *pc_; 256 uint8_t *end_; 257 uint32_t flags_; 258 uint32_t c0_; 259 int captureCount_; 260 int stackCount_; 261 bool isError_; 262 char errorMsg_[TMP_BUF_SIZE] = {0}; // NOLINTNEXTLINE(modernize-avoid-c-arrays) 263 int hasNamedCaptures_ = -1; 264 int totalCaptureCount_ = -1; 265 DynChunk buffer_; 266 DynChunk groupNames_; 267 CVector<CString> newGroupNames_; 268 }; 269 } // namespace panda::ecmascript 270 #endif // ECMASCRIPT_REGEXP_PARSER_H 271