1 /* 2 * Copyright (c) 2021 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #ifndef ECMASCRIPT_REGEXP_PARSER_H 17 #define ECMASCRIPT_REGEXP_PARSER_H 18 19 #include <cstdarg> 20 #include <cstdio> 21 #include <cstdint> 22 #include "ecmascript/mem/chunk.h" 23 #include "ecmascript/mem/c_containers.h" 24 #include "ecmascript/mem/c_string.h" 25 #include "ecmascript/mem/dyn_chunk.h" 26 #include "ecmascript/regexp/regexp_opcode.h" 27 #include "unicode/stringpiece.h" 28 #include "unicode/uchar.h" 29 #include "unicode/utf16.h" 30 #include "unicode/utf8.h" 31 #include "unicode/utypes.h" 32 #include "unicode/udata.h" 33 34 namespace panda::ecmascript { 35 class RegExpParser { 36 public: 37 static constexpr auto FLAG_GLOBAL = (1U << 0U); 38 static constexpr auto FLAG_IGNORECASE = (1U << 1U); 39 static constexpr auto FLAG_MULTILINE = (1U << 2U); 40 static constexpr auto FLAG_DOTALL = (1U << 3U); 41 static constexpr auto FLAG_UTF16 = (1U << 4U); 42 static constexpr auto FLAG_STICKY = (1U << 5U); 43 static const uint32_t KEY_EOF = UINT32_MAX; 44 static constexpr int CLASS_RANGE_BASE = 0x40000000; 45 static constexpr uint32_t NUM_CAPTURE__OFFSET = 4; 46 static constexpr uint32_t NUM_STACK_OFFSET = 8; 47 static constexpr uint32_t OCTAL_VALUE = 8; 48 static constexpr uint32_t OCTAL_VALUE_RANGE = 32; 49 static constexpr uint32_t HEX_VALUE = 16; 50 static constexpr uint32_t DECIMAL_DIGITS_ADVANCE = 10; 51 static constexpr uint32_t FLAGS_OFFSET = 12; 52 static constexpr uint32_t OP_START_OFFSET = 16; 53 static constexpr uint32_t UNICODE_HEX_VALUE = 4; 54 static constexpr uint32_t UNICODE_HEX_ADVANCE = 2; 55 static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3; 56 static constexpr uint32_t UTF8_CHAR_LEN_MAX = 6; 57 static int Canonicalize(int c, bool isUnicode); 58 RegExpParser(Chunk * chunk)59 explicit RegExpParser(Chunk *chunk) 60 : base_(nullptr), 61 pc_(nullptr), 62 end_(nullptr), 63 flags_(0), 64 c0_(KEY_EOF), 65 captureCount_(0), 66 stackCount_(0), 67 isError_(false), 68 isEmpty_(false), 69 buffer_(chunk), 70 groupNames_(chunk) 71 { 72 } 73 ~RegExpParser()74 ~RegExpParser() 75 { 76 Clear(); 77 } 78 79 NO_COPY_SEMANTIC(RegExpParser); 80 NO_MOVE_SEMANTIC(RegExpParser); 81 Init(char * source,size_t length,uint32_t flags)82 inline void Init(char *source, size_t length, uint32_t flags) 83 { 84 pc_ = reinterpret_cast<uint8_t *>(source); 85 base_ = pc_; 86 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 87 end_ = reinterpret_cast<uint8_t *>(source) + length - 1; 88 flags_ = flags; 89 } 90 91 void Parse(); 92 void ParseDisjunction(bool isBackward); 93 void ParseAlternative(bool isBackward); 94 bool ParseAssertionCapture(int *captureIndex, bool isBackward); 95 void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd); 96 int ParseDecimalDigits(); 97 int ParseAtomEscape(bool isBackward); 98 int ParseCharacterEscape(); 99 bool ParseGroupSpecifier(const uint8_t **pp, CString &name); 100 int ParseCaptureCount(const char *groupName); 101 bool ParseClassRanges(RangeSet *result); 102 void ParseNonemptyClassRangesNoDash(DynChunk *buffer); 103 uint32_t ParseClassAtom(RangeSet *atom); 104 int ParseClassEscape(RangeSet *atom); 105 void ParseError(const char *errorMessage); 106 void ParseUnicodePropertyValueCharacters(bool *isValue); 107 int FindGroupName(const CString &name); 108 uint32_t ParseOctalLiteral(); 109 bool ParseHexEscape(int length, uint32_t *value); 110 bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value); 111 bool ParseUnicodeEscape(uint32_t *value); 112 bool ParserIntervalQuantifier(int *pmin, int *pmax); 113 bool HasNamedCaptures(); 114 int ParseEscape(const uint8_t **pp, int isUtf16); 115 int RecountCaptures(); 116 int IsIdentFirst(uint32_t c); 117 bool NeedIntersection(uint32_t c); 118 GetGroupNames()119 inline CVector<CString> GetGroupNames() const 120 { 121 return newGroupNames_; 122 } 123 GetGroupNamesSize()124 inline size_t GetGroupNamesSize() const 125 { 126 return groupNames_.size_; 127 } 128 IsError()129 inline bool IsError() const 130 { 131 return isError_; 132 } 133 GetOriginBuffer()134 inline uint8_t *GetOriginBuffer() const 135 { 136 return buffer_.buf_; 137 } 138 GetOriginBufferSize()139 inline size_t GetOriginBufferSize() const 140 { 141 return buffer_.size_; 142 } 143 GetErrorMsg()144 inline CString GetErrorMsg() const 145 { 146 if (isError_) { 147 return CString(errorMsg_); 148 } 149 return CString(""); 150 } 151 IsGlobal()152 inline bool IsGlobal() const 153 { 154 return (flags_ & FLAG_GLOBAL) != 0; 155 } 156 IsIgnoreCase()157 inline bool IsIgnoreCase() const 158 { 159 return (flags_ & FLAG_IGNORECASE) != 0; 160 } 161 IsMultiline()162 inline bool IsMultiline() const 163 { 164 return (flags_ & FLAG_MULTILINE) != 0; 165 } 166 IsDotAll()167 inline bool IsDotAll() const 168 { 169 return (flags_ & FLAG_DOTALL) != 0; 170 } 171 IsUtf16()172 inline bool IsUtf16() const 173 { 174 return (flags_ & FLAG_UTF16) != 0; 175 } 176 IsStick()177 inline bool IsStick() const 178 { 179 return (flags_ & FLAG_STICKY) != 0; 180 } 181 GetcurrentCharNext(int c)182 inline static int GetcurrentCharNext(int c) 183 { 184 int cur = c; 185 c = u_tolower(static_cast<UChar32>(c)); 186 if (c == cur) { 187 c = u_toupper(static_cast<UChar32>(c)); 188 } 189 if (((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) && 190 !((cur >= 'A' && cur <= 'Z') || (cur >= 'a' && cur <= 'z'))) { 191 c = cur; 192 } 193 return c; 194 } ProcessIntersection(RangeSet * result)195 inline static void ProcessIntersection(RangeSet *result) 196 { 197 RangeSet cr; 198 RangeSet cr1; 199 const uint32_t MINLOWERCHAR = 'a'; 200 const uint32_t MAXLOWERCHAR = 'z' + 1; 201 const uint32_t MINUPPERCHAR = 'A'; 202 const uint32_t MAXUPPERCHAR = 'Z' + 1; 203 // Range values for a and z + 1 204 cr.Insert(MINLOWERCHAR, MAXLOWERCHAR); 205 // Range values for A and Z + 1 206 cr.Insert(MINUPPERCHAR, MAXUPPERCHAR); 207 result->Inter(cr1, cr); 208 result->Insert(cr1); 209 } 210 private: 211 friend class RegExpExecutor; 212 static constexpr int TMP_BUF_SIZE = 128; Clear()213 void Clear() 214 { 215 base_ = nullptr; 216 pc_ = nullptr; 217 end_ = nullptr; 218 c0_ = KEY_EOF; 219 isError_ = false; 220 isEmpty_ = false; 221 } 222 Advance()223 void Advance() 224 { 225 if (pc_ <= end_) { 226 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 227 c0_ = *pc_++; 228 } else { 229 c0_ = KEY_EOF; 230 } 231 } 232 Advance(int offset)233 void Advance(int offset) 234 { 235 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 236 pc_ += offset - 1; 237 Advance(); 238 } 239 Prev()240 void Prev() 241 { 242 if (pc_ >= base_) { 243 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 244 c0_ = *pc_--; 245 } else { 246 c0_ = KEY_EOF; 247 } 248 } 249 SetIsError()250 void SetIsError() 251 { 252 isError_ = true; 253 } 254 255 void PrintF(const char *fmt, ...); 256 uint8_t *base_; 257 uint8_t *pc_; 258 uint8_t *end_; 259 uint32_t flags_; 260 uint32_t c0_; 261 int captureCount_; 262 int stackCount_; 263 bool isError_; 264 bool isEmpty_; 265 char errorMsg_[TMP_BUF_SIZE] = {0}; // NOLINTNEXTLINE(modernize-avoid-c-arrays) 266 int hasNamedCaptures_ = -1; 267 int totalCaptureCount_ = -1; 268 DynChunk buffer_; 269 DynChunk groupNames_; 270 CVector<CString> newGroupNames_; 271 }; 272 } // namespace panda::ecmascript 273 #endif // ECMASCRIPT_REGEXP_PARSER_H 274