1 /* 2 * Copyright (c) 2025 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #ifndef COMMON_INTERFACES_OBJECTS_STRING_BASE_STRING_DECLARE_H 17 #define COMMON_INTERFACES_OBJECTS_STRING_BASE_STRING_DECLARE_H 18 19 #include "common_interfaces/base/bit_field.h" 20 #include "common_interfaces/base/mem.h" 21 #include "common_interfaces/objects/base_object.h" 22 #include "common_interfaces/objects/utils/field_macro.h" 23 #include "common_interfaces/objects/utils/objects_traits.h" 24 #include "common_interfaces/objects/readonly_handle.h" 25 #include "libpandabase/utils/span.h" 26 27 #include <type_traits> 28 29 namespace common { 30 class LineString; 31 class TreeString; 32 class SlicedString; 33 34 using ::panda::Span; 35 36 /* 37 +-----------------------------+ <-- offset 0 38 | BaseObject fields | 39 +-----------------------------+ <-- offset = BaseObjectSize() 40 | LengthAndFlags (uint32_t) | <-- LENGTH_AND_FLAGS_OFFSET 41 +-----------------------------+ 42 | RawHashcode (uint32_t) | <-- RAW_HASHCODE_OFFSET 43 +-----------------------------+ <-- SIZE (== BaseString::SIZE) 44 */ 45 /* 46 +-----------------------------+ 47 | LengthAndFlags (uint32_t) | 48 +-----------------------------+ 49 Bit layout: 50 [0] : CompressedStatusBit (1 bit) 51 [1] : IsInternBit (1 bit) 52 [2 - 31] : LengthBits (30 bits) 53 */ 54 class BaseString : public BaseObject { 55 public: 56 BASE_CAST_CHECK(BaseString, IsString); 57 NO_MOVE_SEMANTIC_CC(BaseString); 58 NO_COPY_SEMANTIC_CC(BaseString); 59 static constexpr uint32_t RAW_HASH_LENGTH = 31; 60 static constexpr uint32_t IS_INTEGER_MASK = 1U << RAW_HASH_LENGTH; 61 static constexpr uint32_t MAX_INTEGER_HASH_NUMBER = 0x3B9AC9FF; 62 static constexpr uint32_t MAX_CACHED_INTEGER_SIZE = 9; 63 static constexpr size_t MAX_STRING_LENGTH = 0x40000000U; // 30 bits for string length, 2 bits for special meaning 64 static constexpr uint32_t MAX_ELEMENT_INDEX_LEN = 10; 65 static constexpr size_t HASH_SHIFT = 5; 66 static constexpr size_t LENGTH_AND_FLAGS_OFFSET = BaseObjectSize(); 67 static constexpr uint32_t STRING_LENGTH_BITS_NUM = 30; 68 69 enum CompressedStatus { 70 STRING_COMPRESSED, 71 STRING_UNCOMPRESSED, 72 }; 73 74 enum TrimMode : uint8_t { 75 TRIM, 76 TRIM_START, 77 TRIM_END, 78 }; 79 80 enum IsIntegerStatus { 81 NOT_INTEGER = 0, 82 IS_INTEGER, 83 }; 84 85 enum ConcatOptStatus { 86 BEGIN_STRING_ADD = 1, 87 IN_STRING_ADD, 88 CONFIRMED_IN_STRING_ADD, 89 END_STRING_ADD, 90 INVALID_STRING_ADD, 91 }; 92 93 using CompressedStatusBit = BitField<CompressedStatus, 0>; // 1 94 using IsInternBit = CompressedStatusBit::NextFlag; // 1 95 using LengthBits = IsInternBit::NextField<uint32_t, STRING_LENGTH_BITS_NUM>; // 30 96 static_assert(LengthBits::START_BIT + LengthBits::SIZE == sizeof(uint32_t) * BITS_PER_BYTE, 97 "LengthBits does not match the field size"); 98 99 PRIMITIVE_FIELD(LengthAndFlags, uint32_t, LENGTH_AND_FLAGS_OFFSET, MIX_HASHCODE_OFFSET) 100 101 using RawHashcode = BitField<uint32_t, 0, RAW_HASH_LENGTH>; // 31 102 using IsIntegerBit = RawHashcode::NextField<IsIntegerStatus, 1>; // 1 103 // In last bit of mix_hash we store if this string is small-integer number or not. 104 PRIMITIVE_FIELD(MixHashcode, uint32_t, MIX_HASHCODE_OFFSET, SIZE) 105 106 static inline uint32_t MixHashcode(uint32_t hashcode, bool isInteger); 107 108 template <typename ReadBarrier> IsInteger(ReadBarrier && readBarrier)109 inline bool IsInteger(ReadBarrier &&readBarrier) 110 { 111 uint32_t hashcode = GetHashcode(std::forward<ReadBarrier>(readBarrier)); 112 return IsIntegerBit::Decode(hashcode) == IS_INTEGER; 113 } 114 IsString()115 bool IsString() const 116 { 117 return GetBaseClass()->IsString(); 118 } 119 IsLineString()120 bool IsLineString() const 121 { 122 return GetBaseClass()->IsLineString(); 123 } 124 IsTreeString()125 bool IsTreeString() const 126 { 127 return GetBaseClass()->IsTreeString(); 128 } 129 IsSlicedString()130 bool IsSlicedString() const 131 { 132 return GetBaseClass()->IsSlicedString(); 133 } 134 135 bool IsUtf8() const; 136 137 bool IsUtf16() const; 138 139 // require is LineString 140 uint16_t *GetData() const; 141 const uint8_t *GetDataUtf8() const; 142 const uint16_t *GetDataUtf16() const; 143 144 // require is LineString 145 uint8_t *GetDataUtf8Writable(); 146 uint16_t *GetDataUtf16Writable(); 147 148 uint32_t GetLength() const; 149 150 void InitLengthAndFlags(uint32_t length, bool compressed = false, bool isIntern = false); 151 152 template <typename ReadBarrier> 153 size_t GetUtf8Length(ReadBarrier &&readBarrier, bool modify = true, bool isGetBufferSize = false) const; 154 155 void SetIsInternString(); 156 157 bool IsInternString() const; 158 159 void ClearInternStringFlag(); 160 161 bool TryGetHashCode(uint32_t *hash) const; 162 163 // not change this data structure. 164 // if string is not flat, this func has low efficiency. 165 template <typename ReadBarrier> 166 uint32_t PUBLIC_API GetHashcode(ReadBarrier &&readBarrier); 167 168 template<class ReadBarrier> 169 uint32_t ComputeHashcode(ReadBarrier &&readBarrier) const; 170 171 template <typename ReadBarrier> 172 std::pair<uint32_t, bool> PUBLIC_API ComputeRawHashcode(ReadBarrier &&readBarrier) const; 173 174 template <bool verify = true, typename ReadBarrier> 175 uint16_t At(ReadBarrier &&readBarrier, int32_t index) const; 176 177 // require is LineString 178 void WriteData(uint32_t index, uint16_t src); 179 180 // Compares string1 + string2 by bytes, It doesn't check canonical unicode equivalence. 181 template <typename ReadBarrier> 182 bool EqualToSplicedString(ReadBarrier &&readBarrier, const BaseString *str1, const BaseString *str2); 183 184 // It allows user to copy into buffer even if maxLength < length 185 template <typename ReadBarrier> 186 size_t WriteUtf8(ReadBarrier &&readBarrier, uint8_t *buf, size_t maxLength, 187 bool isWriteBuffer = false) const; 188 template <typename ReadBarrier> 189 size_t CopyDataToUtf16(ReadBarrier &&readBarrier, uint16_t *buf, uint32_t length, uint32_t bufLength) const; 190 191 // It allows user to copy into buffer even if maxLength < length 192 template <typename ReadBarrier> 193 size_t WriteUtf16(ReadBarrier &&readBarrier, uint16_t *buf, uint32_t targetLength, uint32_t bufLength) const; 194 195 template <typename ReadBarrier> 196 size_t WriteOneByte(ReadBarrier &&readBarrier, uint8_t *buf, size_t maxLength) const; 197 template <typename ReadBarrier> 198 size_t CopyDataRegionUtf8(ReadBarrier &&readBarrier, uint8_t *buf, size_t start, size_t length, 199 size_t maxLength, 200 bool modify = true, bool isWriteBuffer = false) const; 201 template <typename ReadBarrier> 202 uint32_t CopyDataUtf16(ReadBarrier &&readBarrier, uint16_t *buf, uint32_t maxLength) const; 203 template <typename ReadBarrier> 204 std::u16string ToU16String(ReadBarrier &&readBarrier, uint32_t len = 0); 205 206 template <typename ReadBarrier, typename Vec, 207 std::enable_if_t<objects_traits::is_std_vector_of_v<std::decay_t<Vec>, uint8_t>, int> = 0> 208 Span<const uint8_t> ToUtf8Span(ReadBarrier &&readBarrier, Vec &buf, bool modify = true, bool cesu8 = false); 209 210 template <typename ReadBarrier, typename Vec, 211 std::enable_if_t<objects_traits::is_std_vector_of_v<std::decay_t<Vec>, uint8_t>, int> = 0> 212 Span<const uint8_t> DebuggerToUtf8Span(ReadBarrier &&readBarrier, Vec &buf, bool modify = true); 213 214 template <typename ReadBarrier> 215 void WriteData(ReadBarrier &&readBarrier, BaseString *src, uint32_t start, uint32_t destSize, 216 uint32_t length); 217 218 template <typename ReadBarrier> 219 bool IsFlat(ReadBarrier &&readBarrier) const; 220 221 bool NotTreeString() const; 222 223 CommonType GetStringType() const; 224 225 template <class T1, class T2> 226 static uint32_t CalculateDataConcatHashCode(const T1 *dataFirst, size_t sizeFirst, const T2 *dataSecond, 227 size_t sizeSecond); 228 229 static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress); 230 static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length); 231 232 template <typename Allocator, objects_traits::enable_if_is_allocate<Allocator, BaseObject *> = 0> 233 static BaseString *CreateFromUtf8(Allocator &&allocate, const uint8_t *utf8Data, uint32_t utf8Len, 234 bool canBeCompress); 235 236 template <typename Allocator, objects_traits::enable_if_is_allocate<Allocator, BaseObject *> = 0> 237 static BaseString *CreateFromUtf8CompressedSubString(Allocator &&allocate, 238 const ReadOnlyHandle<BaseString> string, 239 uint32_t offset, uint32_t utf8Len); 240 241 template <typename Allocator, objects_traits::enable_if_is_allocate<Allocator, BaseObject *> = 0> 242 static LineString *CreateLineString(Allocator &&allocator, size_t length, bool compressed); 243 244 template <typename Allocator, objects_traits::enable_if_is_allocate<Allocator, BaseObject *> = 0> 245 static BaseString *CreateFromUtf16(Allocator &&allocator, const uint16_t *utf16Data, uint32_t utf16Len, 246 bool canBeCompress); 247 248 template <typename Allocator, typename WriteBarrier, 249 objects_traits::enable_if_is_allocate<Allocator, BaseObject *> = 0, 250 objects_traits::enable_if_is_write_barrier<WriteBarrier> = 0> 251 static SlicedString *CreateSlicedString(Allocator &&allocator, WriteBarrier &&writeBarrier, 252 ReadOnlyHandle<BaseString> parent); 253 254 template <typename Allocator, typename WriteBarrier, 255 objects_traits::enable_if_is_allocate<Allocator, BaseObject *> = 0, 256 objects_traits::enable_if_is_write_barrier<WriteBarrier> = 0> 257 static TreeString *CreateTreeString(Allocator &&allocator, WriteBarrier &&writeBarrier, 258 ReadOnlyHandle<BaseString> left, ReadOnlyHandle<BaseString> right, 259 uint32_t length, bool compressed); 260 261 // Check that two spans are equal. Should have the same length. 262 /* static */ 263 template <typename T, typename T1> 264 static bool StringsAreEquals(Span<const T> &str1, Span<const T1> &str2); 265 266 // Converts utf8Data to utf16 and compare it with given utf16_data. 267 static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data, 268 uint32_t utf16Len); 269 270 // Compares strings by bytes, It doesn't check canonical unicode equivalence. 271 template <typename ReadBarrier> 272 static PUBLIC_API bool StringsAreEqual(ReadBarrier &&readBarrier, BaseString *str1, BaseString *str2); 273 // Two strings have the same type of utf encoding format. 274 template <typename ReadBarrier> 275 static bool StringsAreEqualDiffUtfEncoding(ReadBarrier &&readBarrier, BaseString *str1, BaseString *str2); 276 277 // Compares strings by bytes, It doesn't check canonical unicode equivalence. 278 // not change str1 data structure. 279 // if str1 is not flat, this func has low efficiency. 280 template <typename ReadBarrier> 281 static bool StringIsEqualUint8Data(ReadBarrier &&readBarrier, const BaseString *str1, 282 const uint8_t *dataAddr, uint32_t dataLen, 283 bool canBeCompress); 284 // Compares strings by bytes, It doesn't check canonical unicode equivalence. 285 // not change str1 data structure. 286 // if str1 is not flat, this func has low efficiency. 287 template <typename ReadBarrier> 288 static bool StringsAreEqualUtf16(ReadBarrier &&readBarrier, const BaseString *str1, 289 const uint16_t *utf16Data, uint32_t utf16Len); 290 291 static bool CanBeCompressed(const uint8_t *utf8Data, uint32_t utf8Len); 292 293 static bool CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Len); 294 295 static bool CanBeCompressed(const BaseString *string); 296 297 // single char copy for loop 298 template <typename DstType, typename SrcType> 299 static void CopyChars(DstType *dst, SrcType *src, uint32_t count); 300 301 // To change the hash algorithm of BaseString, please modify BaseString::CalculateConcatHashCode 302 // and BaseStringHashHelper::ComputeHashForDataPlatform simultaneously!! 303 static PUBLIC_API uint32_t ComputeHashForData(const uint8_t *data, size_t size, uint32_t hashSeed); 304 305 static PUBLIC_API uint32_t ComputeHashForData(const uint16_t *data, size_t size, uint32_t hashSeed); 306 307 static bool IsASCIICharacter(uint16_t data); 308 309 template <typename T1, typename T2> 310 static int32_t IndexOf(Span<const T1> &lhsSp, Span<const T2> &rhsSp, int32_t pos, int32_t max); 311 312 template <typename T1, typename T2> 313 static int32_t LastIndexOf(Span<const T1> &lhsSp, Span<const T2> &rhsSp, int32_t pos); 314 315 template <typename Char, typename ReadBarrier> 316 static void WriteToFlat(ReadBarrier &&readBarrier, BaseString *src, Char *buf, uint32_t maxLength); 317 318 template <typename Char, typename ReadBarrier> 319 static void WriteToFlatWithPos(ReadBarrier &&readBarrier, BaseString *src, Char *buf, uint32_t length, 320 uint32_t pos); 321 322 template <typename ReadBarrier, typename Vec, 323 std::enable_if_t<objects_traits::is_std_vector_of_v<std::decay_t<Vec>, uint8_t>, int> = 0> 324 static const uint8_t *PUBLIC_API GetUtf8DataFlat(ReadBarrier &&readBarrier, const BaseString *src, Vec &buf); 325 326 template <typename ReadBarrier> 327 static const uint8_t *PUBLIC_API GetNonTreeUtf8Data(ReadBarrier &&readBarrier, const BaseString *src); 328 329 template <typename ReadBarrier, typename Vec, 330 std::enable_if_t<objects_traits::is_std_vector_of_v<std::decay_t<Vec>, uint16_t>, int> = 0> 331 static const uint16_t *PUBLIC_API GetUtf16DataFlat(ReadBarrier &&readBarrier, const BaseString *src, 332 Vec &buf); 333 334 template <typename ReadBarrier> 335 static const uint16_t *PUBLIC_API GetNonTreeUtf16Data(ReadBarrier &&readBarrier, const BaseString *src); 336 337 template <typename T> 338 static bool HashIntegerString(const T *data, size_t size, uint32_t *hash, uint32_t hashSeed); 339 }; 340 } // namespace common 341 #endif // COMMON_INTERFACES_OBJECTS_STRING_BASE_STRING_DECLARE_H