1 /* 2 * Copyright (c) 2021 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #ifndef ECMASCRIPT_REGEXP_PARSER_H 17 #define ECMASCRIPT_REGEXP_PARSER_H 18 19 #include <cstdarg> 20 #include <cstdio> 21 #include <cstdint> 22 #include "ecmascript/mem/chunk.h" 23 #include "ecmascript/regexp/dyn_chunk.h" 24 #include "ecmascript/regexp/regexp_opcode.h" 25 #include "unicode/stringpiece.h" 26 #include "unicode/uchar.h" 27 #include "unicode/utf16.h" 28 #include "unicode/utf8.h" 29 #include "unicode/utypes.h" 30 31 namespace panda::ecmascript { 32 class RegExpParser { 33 public: 34 static constexpr auto FLAG_GLOBAL = (1U << 0U); 35 static constexpr auto FLAG_IGNORECASE = (1U << 1U); 36 static constexpr auto FLAG_MULTILINE = (1U << 2U); 37 static constexpr auto FLAG_DOTALL = (1U << 3U); 38 static constexpr auto FLAG_UTF16 = (1U << 4U); 39 static constexpr auto FLAG_STICKY = (1U << 5U); 40 static const int KEY_EOF = -1; 41 static constexpr int CLASS_RANGE_BASE = 0x40000000; 42 static constexpr uint32_t NUM_CAPTURE__OFFSET = 4; 43 static constexpr uint32_t NUM_STACK_OFFSET = 8; 44 static constexpr uint32_t OCTAL_VALUE = 8; 45 static constexpr uint32_t OCTAL_VALUE_RANGE = 32; 46 static constexpr uint32_t HEX_VALUE = 16; 47 static constexpr int32_t DECIMAL_DIGITS_ADVANCE = 10; 48 static constexpr uint32_t FLAGS_OFFSET = 12; 49 static constexpr uint32_t OP_START_OFFSET = 16; 50 static constexpr uint32_t UNICODE_HEX_VALUE = 4; 51 static constexpr uint32_t UNICODE_HEX_ADVANCE = 2; 52 static constexpr uint32_t CAPTURE_CONUT_ADVANCE = 3; 53 RegExpParser(Chunk * chunk)54 explicit RegExpParser(Chunk *chunk) 55 : base_(nullptr), 56 pc_(nullptr), 57 end_(nullptr), 58 flags_(0), 59 c0_(KEY_EOF), 60 captureCount_(0), 61 stackCount_(0), 62 isError_(false), 63 buffer_(chunk), 64 groupNames_(chunk) 65 { 66 } 67 ~RegExpParser()68 ~RegExpParser() 69 { 70 Clear(); 71 } 72 73 NO_COPY_SEMANTIC(RegExpParser); 74 NO_MOVE_SEMANTIC(RegExpParser); 75 Init(char * source,size_t length,uint32_t flags)76 inline void Init(char *source, size_t length, uint32_t flags) 77 { 78 pc_ = reinterpret_cast<uint8_t *>(source); 79 base_ = pc_; 80 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 81 end_ = reinterpret_cast<uint8_t *>(source) + length - 1; 82 flags_ = flags; 83 } 84 85 void Parse(); 86 void ParseDisjunction(bool isBackward); 87 void ParseAlternative(bool isBackward); 88 bool ParseAssertionCapture(int *captureIndex, bool isBackward); 89 void ParseQuantifier(size_t atomBcStart, int captureStart, int captureEnd); 90 int ParseDecimalDigits(); 91 int ParseAtomEscape(bool isBackward); 92 int ParseCharacterEscape(); 93 bool ParseGroupSpecifier(const uint8_t **pp, CString &name); 94 int ParseCaptureCount(const char *groupName); 95 bool ParseClassRanges(RangeSet *result); 96 void ParseNonemptyClassRangesNoDash(DynChunk *buffer); 97 uint32_t ParseClassAtom(RangeSet *atom); 98 int ParseClassEscape(RangeSet *atom); 99 void ParseError(const char *errorMessage); 100 void ParseUnicodePropertyValueCharacters(bool *isValue); 101 int FindGroupName(const CString &name); 102 uint32_t ParseOctalLiteral(); 103 bool ParseHexEscape(int length, uint32_t *value); 104 bool ParseUnlimitedLengthHexNumber(uint32_t maxValue, uint32_t *value); 105 bool ParseUnicodeEscape(uint32_t *value); 106 bool ParserIntervalQuantifier(int *pmin, int *pmax); 107 IsError()108 inline bool IsError() const 109 { 110 return isError_; 111 } 112 GetOriginBuffer()113 inline uint8_t *GetOriginBuffer() const 114 { 115 return buffer_.buf_; 116 } 117 GetOriginBufferSize()118 inline size_t GetOriginBufferSize() const 119 { 120 return buffer_.size_; 121 } 122 GetErrorMsg()123 inline CString GetErrorMsg() const 124 { 125 if (isError_) { 126 return CString(errorMsg_); 127 } 128 return CString(""); 129 } 130 IsGlobal()131 inline bool IsGlobal() const 132 { 133 return (flags_ & FLAG_GLOBAL) != 0; 134 } 135 IsIgnoreCase()136 inline bool IsIgnoreCase() const 137 { 138 return (flags_ & FLAG_IGNORECASE) != 0; 139 } 140 IsMultiline()141 inline bool IsMultiline() const 142 { 143 return (flags_ & FLAG_MULTILINE) != 0; 144 } 145 IsDotAll()146 inline bool IsDotAll() const 147 { 148 return (flags_ & FLAG_DOTALL) != 0; 149 } 150 IsUtf16()151 inline bool IsUtf16() const 152 { 153 return (flags_ & FLAG_UTF16) != 0; 154 } 155 IsStick()156 inline bool IsStick() const 157 { 158 return (flags_ & FLAG_STICKY) != 0; 159 } 160 Canonicalize(int c,bool isUnicode)161 inline static int Canonicalize(int c, bool isUnicode) 162 { 163 if (c < TMP_BUF_SIZE) { // NOLINTNEXTLINE(readability-magic-numbers) 164 if (c >= 'a' && c <= 'z') { 165 c = c - 'a' + 'A'; 166 } 167 } else { 168 if (isUnicode) { 169 c = u_toupper(static_cast<UChar32>(c)); 170 } 171 } 172 return c; 173 } 174 175 private: 176 friend class RegExpExecutor; 177 static constexpr int TMP_BUF_SIZE = 128; Clear()178 void Clear() 179 { 180 base_ = nullptr; 181 pc_ = nullptr; 182 end_ = nullptr; 183 c0_ = KEY_EOF; 184 isError_ = false; 185 } 186 Advance()187 void Advance() 188 { 189 if (pc_ <= end_) { 190 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 191 c0_ = *pc_++; 192 } else { 193 c0_ = KEY_EOF; 194 } 195 } 196 Advance(int offset)197 void Advance(int offset) 198 { 199 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 200 pc_ += offset - 1; 201 Advance(); 202 } 203 Prev()204 void Prev() 205 { 206 if (pc_ >= base_) { 207 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 208 c0_ = *pc_--; 209 } else { 210 c0_ = KEY_EOF; 211 } 212 } 213 SetIsError()214 void SetIsError() 215 { 216 isError_ = true; 217 } 218 219 void PrintF(const char *fmt, ...); 220 uint8_t *base_; 221 uint8_t *pc_; 222 uint8_t *end_; 223 uint32_t flags_; 224 int c0_; 225 int captureCount_; 226 int stackCount_; 227 bool isError_; 228 char errorMsg_[TMP_BUF_SIZE] = {0}; // NOLINTNEXTLINE(modernize-avoid-c-arrays) 229 DynChunk buffer_; 230 DynChunk groupNames_; 231 }; 232 } // namespace panda::ecmascript 233 #endif // ECMASCRIPT_REGEXP_PARSER_H