1 // Copyright 2017 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_OBJECTS_STRING_H_ 6 #define V8_OBJECTS_STRING_H_ 7 8 #include <memory> 9 10 #include "src/base/bits.h" 11 #include "src/base/export-template.h" 12 #include "src/base/strings.h" 13 #include "src/common/globals.h" 14 #include "src/objects/instance-type.h" 15 #include "src/objects/map.h" 16 #include "src/objects/name.h" 17 #include "src/objects/smi.h" 18 #include "src/strings/unicode-decoder.h" 19 20 // Has to be the last include (doesn't have include guards): 21 #include "src/objects/object-macros.h" 22 23 namespace v8 { 24 namespace internal { 25 26 class SharedStringAccessGuardIfNeeded; 27 28 enum InstanceType : uint16_t; 29 30 enum AllowNullsFlag { ALLOW_NULLS, DISALLOW_NULLS }; 31 enum RobustnessFlag { ROBUST_STRING_TRAVERSAL, FAST_STRING_TRAVERSAL }; 32 33 // The characteristics of a string are stored in its map. Retrieving these 34 // few bits of information is moderately expensive, involving two memory 35 // loads where the second is dependent on the first. To improve efficiency 36 // the shape of the string is given its own class so that it can be retrieved 37 // once and used for several string operations. A StringShape is small enough 38 // to be passed by value and is immutable, but be aware that flattening a 39 // string can potentially alter its shape. Also be aware that a GC caused by 40 // something else can alter the shape of a string due to ConsString 41 // shortcutting. Keeping these restrictions in mind has proven to be error- 42 // prone and so we no longer put StringShapes in variables unless there is a 43 // concrete performance benefit at that particular point in the code. 
44 class StringShape { 45 public: 46 V8_INLINE explicit StringShape(const String s); 47 V8_INLINE explicit StringShape(const String s, PtrComprCageBase cage_base); 48 V8_INLINE explicit StringShape(Map s); 49 V8_INLINE explicit StringShape(InstanceType t); 50 V8_INLINE bool IsSequential() const; 51 V8_INLINE bool IsExternal() const; 52 V8_INLINE bool IsCons() const; 53 V8_INLINE bool IsSliced() const; 54 V8_INLINE bool IsThin() const; 55 V8_INLINE bool IsDirect() const; 56 V8_INLINE bool IsIndirect() const; 57 V8_INLINE bool IsUncachedExternal() const; 58 V8_INLINE bool IsExternalOneByte() const; 59 V8_INLINE bool IsExternalTwoByte() const; 60 V8_INLINE bool IsSequentialOneByte() const; 61 V8_INLINE bool IsSequentialTwoByte() const; 62 V8_INLINE bool IsInternalized() const; 63 V8_INLINE bool IsShared() const; 64 V8_INLINE bool CanMigrateInParallel() const; 65 V8_INLINE StringRepresentationTag representation_tag() const; 66 V8_INLINE uint32_t encoding_tag() const; 67 V8_INLINE uint32_t representation_and_encoding_tag() const; 68 V8_INLINE uint32_t representation_encoding_and_shared_tag() const; 69 #ifdef DEBUG type()70 inline uint32_t type() const { return type_; } invalidate()71 inline void invalidate() { valid_ = false; } valid()72 inline bool valid() const { return valid_; } 73 #else invalidate()74 inline void invalidate() {} 75 #endif 76 77 // Run different behavior for each concrete string class type, as defined by 78 // the dispatcher. 79 template <typename TDispatcher, typename TResult, typename... TArgs> 80 inline TResult DispatchToSpecificTypeWithoutCast(TArgs&&... args); 81 template <typename TDispatcher, typename TResult, typename... TArgs> 82 inline TResult DispatchToSpecificType(String str, TArgs&&... 
args); 83 84 private: 85 uint32_t type_; 86 #ifdef DEBUG set_valid()87 inline void set_valid() { valid_ = true; } 88 bool valid_; 89 #else set_valid()90 inline void set_valid() {} 91 #endif 92 }; 93 94 #include "torque-generated/src/objects/string-tq.inc" 95 96 // The String abstract class captures JavaScript string values: 97 // 98 // Ecma-262: 99 // 4.3.16 String Value 100 // A string value is a member of the type String and is a finite 101 // ordered sequence of zero or more 16-bit unsigned integer values. 102 // 103 // All string values have a length field. 104 class String : public TorqueGeneratedString<String, Name> { 105 public: 106 enum Encoding { ONE_BYTE_ENCODING, TWO_BYTE_ENCODING }; 107 108 // Representation of the flat content of a String. 109 // A non-flat string doesn't have flat content. 110 // A flat string has content that's encoded as a sequence of either 111 // one-byte chars or two-byte UC16. 112 // Returned by String::GetFlatContent(). 113 // Not safe to use from concurrent background threads. 114 // TODO(solanes): Move FlatContent into FlatStringReader, and make it private. 115 // This would de-duplicate code, as well as taking advantage of the fact that 116 // FlatStringReader is relocatable. 117 class FlatContent { 118 public: 119 inline ~FlatContent(); 120 121 // Returns true if the string is flat and this structure contains content. IsFlat()122 bool IsFlat() const { return state_ != NON_FLAT; } 123 // Returns true if the structure contains one-byte content. IsOneByte()124 bool IsOneByte() const { return state_ == ONE_BYTE; } 125 // Returns true if the structure contains two-byte content. IsTwoByte()126 bool IsTwoByte() const { return state_ == TWO_BYTE; } 127 128 // Return the one byte content of the string. Only use if IsOneByte() 129 // returns true. 
ToOneByteVector()130 base::Vector<const uint8_t> ToOneByteVector() const { 131 DCHECK_EQ(ONE_BYTE, state_); 132 return base::Vector<const uint8_t>(onebyte_start, length_); 133 } 134 // Return the two-byte content of the string. Only use if IsTwoByte() 135 // returns true. ToUC16Vector()136 base::Vector<const base::uc16> ToUC16Vector() const { 137 DCHECK_EQ(TWO_BYTE, state_); 138 return base::Vector<const base::uc16>(twobyte_start, length_); 139 } 140 Get(int i)141 base::uc16 Get(int i) const { 142 DCHECK(i < length_); 143 DCHECK(state_ != NON_FLAT); 144 if (state_ == ONE_BYTE) return onebyte_start[i]; 145 return twobyte_start[i]; 146 } 147 UsesSameString(const FlatContent & other)148 bool UsesSameString(const FlatContent& other) const { 149 return onebyte_start == other.onebyte_start; 150 } 151 152 // It is almost always a bug if the contents of a FlatContent changes during 153 // its lifetime, which can happen due to GC or bugs in concurrent string 154 // access. Rarely, callers need the ability to GC and have ensured safety in 155 // other ways, such as in IrregexpInterpreter. Those callers can disable the 156 // checksum verification with this call. UnsafeDisableChecksumVerification()157 void UnsafeDisableChecksumVerification() { 158 #ifdef ENABLE_SLOW_DCHECKS 159 checksum_ = kChecksumVerificationDisabled; 160 #endif 161 } 162 length()163 int length() const { return length_; } 164 165 private: 166 enum State { NON_FLAT, ONE_BYTE, TWO_BYTE }; 167 168 // Constructors only used by String::GetFlatContent(). 
169 inline FlatContent(const uint8_t* start, int length, 170 const DisallowGarbageCollection& no_gc); 171 inline FlatContent(const base::uc16* start, int length, 172 const DisallowGarbageCollection& no_gc); FlatContent(const DisallowGarbageCollection & no_gc)173 explicit FlatContent(const DisallowGarbageCollection& no_gc) 174 : onebyte_start(nullptr), length_(0), state_(NON_FLAT), no_gc_(no_gc) {} 175 176 union { 177 const uint8_t* onebyte_start; 178 const base::uc16* twobyte_start; 179 }; 180 int length_; 181 State state_; 182 const DisallowGarbageCollection& no_gc_; 183 184 static constexpr uint32_t kChecksumVerificationDisabled = 0; 185 186 #ifdef ENABLE_SLOW_DCHECKS 187 inline uint32_t ComputeChecksum() const; 188 189 uint32_t checksum_; 190 #endif 191 192 friend class String; 193 friend class IterableSubString; 194 }; 195 196 template <typename IsolateT> 197 EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE) 198 void MakeThin(IsolateT* isolate, String canonical); 199 200 template <typename Char> 201 V8_INLINE base::Vector<const Char> GetCharVector( 202 const DisallowGarbageCollection& no_gc); 203 204 // Get chars from sequential or external strings. May only be called when a 205 // SharedStringAccessGuard is not needed (i.e. on the main thread or on 206 // read-only strings). 207 template <typename Char> 208 inline const Char* GetChars(PtrComprCageBase cage_base, 209 const DisallowGarbageCollection& no_gc) const; 210 211 // Get chars from sequential or external strings. 212 template <typename Char> 213 inline const Char* GetChars( 214 PtrComprCageBase cage_base, const DisallowGarbageCollection& no_gc, 215 const SharedStringAccessGuardIfNeeded& access_guard) const; 216 217 // Returns the address of the character at an offset into this string. 218 // Requires: this->IsFlat() 219 const byte* AddressOfCharacterAt(int start_index, 220 const DisallowGarbageCollection& no_gc); 221 222 // Forward declare the non-atomic (set_)length defined in torque. 
223 using TorqueGeneratedString::length; 224 using TorqueGeneratedString::set_length; 225 DECL_RELEASE_ACQUIRE_INT_ACCESSORS(length) 226 227 // Returns whether this string has only one-byte chars, i.e. all of them can 228 // be one-byte encoded. This might be the case even if the string is 229 // two-byte. Such strings may appear when the embedder prefers 230 // two-byte external representations even for one-byte data. 231 DECL_GETTER(IsOneByteRepresentation, bool) 232 DECL_GETTER(IsTwoByteRepresentation, bool) 233 234 // Cons and slices have an encoding flag that may not represent the actual 235 // encoding of the underlying string. This is taken into account here. 236 // This function is static because that helps it get inlined. 237 // Requires: string.IsFlat() 238 static inline bool IsOneByteRepresentationUnderneath(String string); 239 240 // Get and set individual two byte chars in the string. 241 inline void Set(int index, uint16_t value); 242 // Get individual two byte char in the string. Repeated calls 243 // to this method are not efficient unless the string is flat. 244 // If it is called from a background thread, the LocalIsolate version should 245 // be used. 246 V8_INLINE uint16_t Get(int index) const; 247 V8_INLINE uint16_t Get(int index, Isolate* isolate) const; 248 V8_INLINE uint16_t Get(int index, LocalIsolate* local_isolate) const; 249 // Method to pass down the access_guard. Useful for recursive calls such as 250 // ThinStrings where we go String::Get into ThinString::Get into String::Get 251 // again for the internalized string. 252 V8_INLINE uint16_t 253 Get(int index, PtrComprCageBase cage_base, 254 const SharedStringAccessGuardIfNeeded& access_guard) const; 255 256 // ES6 section 7.1.3.1 ToNumber Applied to the String Type 257 static Handle<Object> ToNumber(Isolate* isolate, Handle<String> subject); 258 259 // Flattens the string. Checks first inline to see if it is 260 // necessary. Does nothing if the string is not a cons string. 
261 // Flattening allocates a sequential string with the same data as 262 // the given string and mutates the cons string to a degenerate 263 // form, where the first component is the new sequential string and 264 // the second component is the empty string. If allocation fails, 265 // this function returns a failure. If flattening succeeds, this 266 // function returns the sequential string that is now the first 267 // component of the cons string. 268 // 269 // Degenerate cons strings are handled specially by the garbage 270 // collector (see IsShortcutCandidate). 271 272 static V8_INLINE Handle<String> Flatten( 273 Isolate* isolate, Handle<String> string, 274 AllocationType allocation = AllocationType::kYoung); 275 static V8_INLINE Handle<String> Flatten( 276 LocalIsolate* isolate, Handle<String> string, 277 AllocationType allocation = AllocationType::kYoung); 278 279 // Tries to return the content of a flat string as a structure holding either 280 // a flat vector of char or of base::uc16. 281 // If the string isn't flat, and therefore doesn't have flat content, the 282 // returned structure will report so, and can't provide a vector of either 283 // kind. 284 // When using a SharedStringAccessGuard, the guard's must outlive the 285 // returned FlatContent. 286 V8_EXPORT_PRIVATE V8_INLINE FlatContent 287 GetFlatContent(const DisallowGarbageCollection& no_gc); 288 V8_EXPORT_PRIVATE V8_INLINE FlatContent 289 GetFlatContent(const DisallowGarbageCollection& no_gc, 290 const SharedStringAccessGuardIfNeeded&); 291 292 // Returns the parent of a sliced string or first part of a flat cons string. 293 // Requires: StringShape(this).IsIndirect() && this->IsFlat() 294 inline String GetUnderlying() const; 295 296 // Shares the string. Checks inline if the string is already shared or can be 297 // shared by transitioning its map in-place. If neither is possible, flattens 298 // and copies into a new shared sequential string. 
299 static inline Handle<String> Share(Isolate* isolate, Handle<String> string); 300 301 // String relational comparison, implemented according to ES6 section 7.2.11 302 // Abstract Relational Comparison (step 5): The comparison of Strings uses a 303 // simple lexicographic ordering on sequences of code unit values. There is no 304 // attempt to use the more complex, semantically oriented definitions of 305 // character or string equality and collating order defined in the Unicode 306 // specification. Therefore String values that are canonically equal according 307 // to the Unicode standard could test as unequal. In effect this algorithm 308 // assumes that both Strings are already in normalized form. Also, note that 309 // for strings containing supplementary characters, lexicographic ordering on 310 // sequences of UTF-16 code unit values differs from that on sequences of code 311 // point values. 312 V8_WARN_UNUSED_RESULT static ComparisonResult Compare(Isolate* isolate, 313 Handle<String> x, 314 Handle<String> y); 315 316 // Perform ES6 21.1.3.8, including checking arguments. 317 static Object IndexOf(Isolate* isolate, Handle<Object> receiver, 318 Handle<Object> search, Handle<Object> position); 319 // Perform string match of pattern on subject, starting at start index. 320 // Caller must ensure that 0 <= start_index <= sub->length(), as this does not 321 // check any arguments. 322 static int IndexOf(Isolate* isolate, Handle<String> receiver, 323 Handle<String> search, int start_index); 324 325 static Object LastIndexOf(Isolate* isolate, Handle<Object> receiver, 326 Handle<Object> search, Handle<Object> position); 327 328 // Encapsulates logic related to a match and its capture groups as required 329 // by GetSubstitution. 
330 class Match { 331 public: 332 virtual Handle<String> GetMatch() = 0; 333 virtual Handle<String> GetPrefix() = 0; 334 virtual Handle<String> GetSuffix() = 0; 335 336 // A named capture can be unmatched (either not specified in the pattern, 337 // or specified but unmatched in the current string), or matched. 338 enum CaptureState { UNMATCHED, MATCHED }; 339 340 virtual int CaptureCount() = 0; 341 virtual bool HasNamedCaptures() = 0; 342 virtual MaybeHandle<String> GetCapture(int i, bool* capture_exists) = 0; 343 virtual MaybeHandle<String> GetNamedCapture(Handle<String> name, 344 CaptureState* state) = 0; 345 346 virtual ~Match() = default; 347 }; 348 349 // ES#sec-getsubstitution 350 // GetSubstitution(matched, str, position, captures, replacement) 351 // Expand the $-expressions in the string and return a new string with 352 // the result. 353 // A {start_index} can be passed to specify where to start scanning the 354 // replacement string. 355 V8_WARN_UNUSED_RESULT static MaybeHandle<String> GetSubstitution( 356 Isolate* isolate, Match* match, Handle<String> replacement, 357 int start_index = 0); 358 359 // String equality operations. 360 inline bool Equals(String other) const; 361 inline static bool Equals(Isolate* isolate, Handle<String> one, 362 Handle<String> two); 363 364 enum class EqualityType { kWholeString, kPrefix, kNoLengthCheck }; 365 366 // Check if this string matches the given vector of characters, either as a 367 // whole string or just a prefix. 368 // 369 // The Isolate is passed as "evidence" that this call is on the main thread, 370 // and to distiguish from the LocalIsolate overload. 371 template <EqualityType kEqType = EqualityType::kWholeString, typename Char> 372 inline bool IsEqualTo(base::Vector<const Char> str, Isolate* isolate) const; 373 374 // Check if this string matches the given vector of characters, either as a 375 // whole string or just a prefix. 
376 // 377 // This is main-thread only, like the Isolate* overload, but additionally 378 // computes the PtrComprCageBase for IsEqualToImpl. 379 template <EqualityType kEqType = EqualityType::kWholeString, typename Char> 380 inline bool IsEqualTo(base::Vector<const Char> str) const; 381 382 // Check if this string matches the given vector of characters, either as a 383 // whole string or just a prefix. 384 // 385 // The LocalIsolate is passed to provide access to the string access lock, 386 // which is taken when reading the string's contents on a background thread. 387 template <EqualityType kEqType = EqualityType::kWholeString, typename Char> 388 inline bool IsEqualTo(base::Vector<const Char> str, 389 LocalIsolate* isolate) const; 390 391 V8_EXPORT_PRIVATE bool HasOneBytePrefix(base::Vector<const char> str); 392 V8_EXPORT_PRIVATE inline bool IsOneByteEqualTo(base::Vector<const char> str); 393 394 // Returns true if the |str| is a valid ECMAScript identifier. 395 static bool IsIdentifier(Isolate* isolate, Handle<String> str); 396 397 // Return a UTF8 representation of the string. The string is null 398 // terminated but may optionally contain nulls. Length is returned 399 // in length_output if length_output is not a null pointer The string 400 // should be nearly flat, otherwise the performance of this method may 401 // be very slow (quadratic in the length). Setting robustness_flag to 402 // ROBUST_STRING_TRAVERSAL invokes behaviour that is robust This means it 403 // handles unexpected data without causing assert failures and it does not 404 // do any heap allocations. This is useful when printing stack traces. 
405 std::unique_ptr<char[]> ToCString(AllowNullsFlag allow_nulls, 406 RobustnessFlag robustness_flag, int offset, 407 int length, int* length_output = nullptr); 408 V8_EXPORT_PRIVATE std::unique_ptr<char[]> ToCString( 409 AllowNullsFlag allow_nulls = DISALLOW_NULLS, 410 RobustnessFlag robustness_flag = FAST_STRING_TRAVERSAL, 411 int* length_output = nullptr); 412 413 // Externalization. 414 V8_EXPORT_PRIVATE bool MakeExternal( 415 v8::String::ExternalStringResource* resource); 416 V8_EXPORT_PRIVATE bool MakeExternal( 417 v8::String::ExternalOneByteStringResource* resource); 418 bool SupportsExternalization(); 419 420 // Conversion. 421 // "array index": an index allowed by the ES spec for JSArrays. 422 inline bool AsArrayIndex(uint32_t* index); 423 424 // This is used for calculating array indices but differs from an 425 // Array Index in the regard that this does not support the full 426 // array index range. This only supports positive numbers less than 427 // or equal to INT_MAX. 428 // 429 // String::AsArrayIndex might be a better fit if you're looking to 430 // calculate the array index. 431 // 432 // if val < 0 or val > INT_MAX, returns -1 433 // if 0 <= val <= INT_MAX, returns val 434 static int32_t ToArrayIndex(Address addr); 435 436 // "integer index": the string is the decimal representation of an 437 // integer in the range of a size_t. Useful for TypedArray accesses. 438 inline bool AsIntegerIndex(size_t* index); 439 440 // Trimming. 441 enum TrimMode { kTrim, kTrimStart, kTrimEnd }; 442 443 V8_EXPORT_PRIVATE void PrintOn(FILE* out); 444 V8_EXPORT_PRIVATE void PrintOn(std::ostream& out); 445 446 // For use during stack traces. Performs rudimentary sanity check. 447 bool LooksValid(); 448 449 // Printing utility functions. 450 // - PrintUC16 prints the raw string contents to the given stream. 451 // Non-printable characters are formatted as hex, but otherwise the string 452 // is printed as-is. 
453 // - StringShortPrint and StringPrint have extra formatting: they add a 454 // prefix and suffix depending on the string kind, may add other information 455 // such as the string heap object address, may truncate long strings, etc. 456 const char* PrefixForDebugPrint() const; 457 const char* SuffixForDebugPrint() const; 458 void StringShortPrint(StringStream* accumulator); 459 void PrintUC16(std::ostream& os, int start = 0, int end = -1); 460 void PrintUC16(StringStream* accumulator, int start, int end); 461 462 // Dispatched behavior. 463 #if defined(DEBUG) || defined(OBJECT_PRINT) 464 char* ToAsciiArray(); 465 #endif 466 DECL_PRINTER(String) 467 DECL_VERIFIER(String) 468 469 inline bool IsFlat() const; 470 inline bool IsFlat(PtrComprCageBase cage_base) const; 471 472 inline bool IsShared() const; 473 inline bool IsShared(PtrComprCageBase cage_base) const; 474 475 // Max char codes. 476 static const int32_t kMaxOneByteCharCode = unibrow::Latin1::kMaxChar; 477 static const uint32_t kMaxOneByteCharCodeU = unibrow::Latin1::kMaxChar; 478 static const int kMaxUtf16CodeUnit = 0xffff; 479 static const uint32_t kMaxUtf16CodeUnitU = kMaxUtf16CodeUnit; 480 static const base::uc32 kMaxCodePoint = 0x10ffff; 481 482 // Maximal string length. 483 // The max length is different on 32 and 64 bit platforms. Max length for 484 // 32-bit platforms is ~268.4M chars. On 64-bit platforms, max length is 485 // ~536.8M chars. 486 // See include/v8.h for the definition. 487 static const int kMaxLength = v8::String::kMaxLength; 488 // There are several defining limits imposed by our current implementation: 489 // - any string's length must fit into a Smi. 
490 static_assert(kMaxLength <= kSmiMaxValue, 491 "String length must fit into a Smi"); 492 // - adding two string lengths must still fit into a 32-bit int without 493 // overflow 494 static_assert(kMaxLength * 2 <= kMaxInt, 495 "String::kMaxLength * 2 must fit into an int32"); 496 // - any heap object's size in bytes must be able to fit into a Smi, because 497 // its space on the heap might be filled with a Filler; for strings this 498 // means SeqTwoByteString::kMaxSize must be able to fit into a Smi. 499 static_assert(kMaxLength * 2 + kHeaderSize <= kSmiMaxValue, 500 "String object size in bytes must fit into a Smi"); 501 // - any heap object's size in bytes must be able to fit into an int, because 502 // that's what our object handling code uses almost everywhere. 503 static_assert(kMaxLength * 2 + kHeaderSize <= kMaxInt, 504 "String object size in bytes must fit into an int"); 505 506 // Max length for computing hash. For strings longer than this limit the 507 // string length is used as the hash value. 508 static const int kMaxHashCalcLength = 16383; 509 510 // Limit for truncation in short printing. 511 static const int kMaxShortPrintLength = 1024; 512 513 // Helper function for flattening strings. 
514 template <typename sinkchar> 515 EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE) 516 static void WriteToFlat(String source, sinkchar* sink, int from, int to); 517 template <typename sinkchar> 518 static void WriteToFlat(String source, sinkchar* sink, int from, int to, 519 PtrComprCageBase cage_base, 520 const SharedStringAccessGuardIfNeeded&); 521 IsAscii(const char * chars,int length)522 static inline bool IsAscii(const char* chars, int length) { 523 return IsAscii(reinterpret_cast<const uint8_t*>(chars), length); 524 } 525 IsAscii(const uint8_t * chars,int length)526 static inline bool IsAscii(const uint8_t* chars, int length) { 527 return NonAsciiStart(chars, length) >= length; 528 } 529 NonOneByteStart(const base::uc16 * chars,int length)530 static inline int NonOneByteStart(const base::uc16* chars, int length) { 531 DCHECK(IsAligned(reinterpret_cast<Address>(chars), sizeof(base::uc16))); 532 const uint16_t* start = chars; 533 const uint16_t* limit = chars + length; 534 535 if (static_cast<size_t>(length) >= kUIntptrSize) { 536 // Check unaligned chars. 537 while (!IsAligned(reinterpret_cast<Address>(chars), kUIntptrSize)) { 538 if (*chars > unibrow::Latin1::kMaxChar) { 539 return static_cast<int>(chars - start); 540 } 541 ++chars; 542 } 543 544 // Check aligned words. 545 STATIC_ASSERT(unibrow::Latin1::kMaxChar == 0xFF); 546 #ifdef V8_TARGET_LITTLE_ENDIAN 547 const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFFFF * 0xFF00; 548 #else 549 const uintptr_t non_one_byte_mask = kUintptrAllBitsSet / 0xFFFF * 0x00FF; 550 #endif 551 while (chars + sizeof(uintptr_t) <= limit) { 552 if (*reinterpret_cast<const uintptr_t*>(chars) & non_one_byte_mask) { 553 break; 554 } 555 chars += (sizeof(uintptr_t) / sizeof(base::uc16)); 556 } 557 } 558 559 // Check remaining unaligned chars, or find non-one-byte char in word. 
560 while (chars < limit) { 561 if (*chars > unibrow::Latin1::kMaxChar) { 562 return static_cast<int>(chars - start); 563 } 564 ++chars; 565 } 566 567 return static_cast<int>(chars - start); 568 } 569 IsOneByte(const base::uc16 * chars,int length)570 static inline bool IsOneByte(const base::uc16* chars, int length) { 571 return NonOneByteStart(chars, length) >= length; 572 } 573 574 // May only be called when a SharedStringAccessGuard is not needed (i.e. on 575 // the main thread or on read-only strings). 576 template <class Visitor> 577 static inline ConsString VisitFlat(Visitor* visitor, String string, 578 int offset = 0); 579 580 template <class Visitor> 581 static inline ConsString VisitFlat( 582 Visitor* visitor, String string, int offset, 583 const SharedStringAccessGuardIfNeeded& access_guard); 584 585 template <typename IsolateT> 586 static Handle<FixedArray> CalculateLineEnds(IsolateT* isolate, 587 Handle<String> string, 588 bool include_ending_line); 589 590 // Returns true if string can be internalized without copying. In such cases 591 // the string is inserted into the string table and its map is changed to an 592 // internalized equivalent. 593 static inline bool IsInPlaceInternalizable(String string); 594 static inline bool IsInPlaceInternalizable(InstanceType instance_type); 595 596 static inline bool IsInPlaceInternalizableExcludingExternal( 597 InstanceType instance_type); 598 599 private: 600 friend class Name; 601 friend class StringTableInsertionKey; 602 friend class SharedStringTableInsertionKey; 603 friend class InternalizedStringKey; 604 605 // Implementation of the Get() public methods. Do not use directly. 606 V8_INLINE uint16_t 607 GetImpl(int index, PtrComprCageBase cage_base, 608 const SharedStringAccessGuardIfNeeded& access_guard) const; 609 610 // Implementation of the IsEqualTo() public methods. Do not use directly. 
611 template <EqualityType kEqType, typename Char> 612 V8_INLINE bool IsEqualToImpl( 613 base::Vector<const Char> str, PtrComprCageBase cage_base, 614 const SharedStringAccessGuardIfNeeded& access_guard) const; 615 616 // Out-of-line IsEqualToImpl for ConsString. 617 template <typename Char> 618 V8_NOINLINE static bool IsConsStringEqualToImpl( 619 ConsString string, int slice_offset, base::Vector<const Char> str, 620 PtrComprCageBase cage_base, 621 const SharedStringAccessGuardIfNeeded& access_guard); 622 623 V8_EXPORT_PRIVATE static Handle<String> SlowFlatten( 624 Isolate* isolate, Handle<ConsString> cons, AllocationType allocation); 625 626 V8_EXPORT_PRIVATE V8_INLINE static base::Optional<FlatContent> 627 TryGetFlatContentFromDirectString(PtrComprCageBase cage_base, 628 const DisallowGarbageCollection& no_gc, 629 String string, int offset, int length, 630 const SharedStringAccessGuardIfNeeded&); 631 V8_EXPORT_PRIVATE FlatContent 632 SlowGetFlatContent(const DisallowGarbageCollection& no_gc, 633 const SharedStringAccessGuardIfNeeded&); 634 635 V8_EXPORT_PRIVATE static Handle<String> SlowShare(Isolate* isolate, 636 Handle<String> source); 637 638 // Slow case of String::Equals. This implementation works on any strings 639 // but it is most efficient on strings that are almost flat. 640 V8_EXPORT_PRIVATE bool SlowEquals(String other) const; 641 V8_EXPORT_PRIVATE bool SlowEquals( 642 String other, const SharedStringAccessGuardIfNeeded&) const; 643 644 V8_EXPORT_PRIVATE static bool SlowEquals(Isolate* isolate, Handle<String> one, 645 Handle<String> two); 646 647 // Slow case of AsArrayIndex. 648 V8_EXPORT_PRIVATE bool SlowAsArrayIndex(uint32_t* index); 649 V8_EXPORT_PRIVATE bool SlowAsIntegerIndex(size_t* index); 650 651 // Compute and set the hash code. 
V8_EXPORT_PRIVATE uint32_t ComputeAndSetHash();
  // Overload that takes an explicit SharedStringAccessGuardIfNeeded.
  V8_EXPORT_PRIVATE uint32_t
  ComputeAndSetHash(const SharedStringAccessGuardIfNeeded&);

  TQ_OBJECT_CONSTRUCTORS(String)
};

// Explicit instantiation declarations of String::WriteToFlat for uint8_t and
// uint16_t sinks, each both without and with an explicit cage base and
// access guard.
// clang-format off
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void String::WriteToFlat(String source, uint8_t* sink, int from, int to);
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void String::WriteToFlat(String source, uint16_t* sink, int from, int to);
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void String::WriteToFlat(String source, uint8_t* sink, int from, int to,
                         PtrComprCageBase cage_base,
                         const SharedStringAccessGuardIfNeeded&);
extern template EXPORT_TEMPLATE_DECLARE(V8_EXPORT_PRIVATE)
void String::WriteToFlat(String source, uint16_t* sink, int from, int to,
                         PtrComprCageBase cage_base,
                         const SharedStringAccessGuardIfNeeded&);
// clang-format on

// A range over (part of) a String that exposes begin()/end() iterators.
// NOTE(review): `length = -1` presumably means "up to the end of the string";
// confirm against the inline constructor definition.
class SubStringRange {
 public:
  inline SubStringRange(String string, const DisallowGarbageCollection& no_gc,
                        int first = 0, int length = -1);
  class iterator;
  inline iterator begin();
  inline iterator end();

 private:
  String string_;
  int first_;
  int length_;
  // Reference member; presumably bound to the constructor's `no_gc` argument,
  // in which case that scope object must outlive this range.
  const DisallowGarbageCollection& no_gc_;
};

// The SeqString abstract class captures sequential string values.
class SeqString : public TorqueGeneratedSeqString<SeqString, String> {
 public:
  // Truncate the string in-place if possible and return the result.
  // In case of new_length == 0, the empty string is returned without
  // truncating the original string.
  V8_WARN_UNUSED_RESULT static Handle<String> Truncate(Handle<SeqString> string,
                                                       int new_length);

  TQ_OBJECT_CONSTRUCTORS(SeqString)
};

class InternalizedString
    : public TorqueGeneratedInternalizedString<InternalizedString, String> {
 public:
  // TODO(neis): Possibly move some stuff from String here.

  TQ_OBJECT_CONSTRUCTORS(InternalizedString)
};

// The SeqOneByteString class captures sequential one-byte string objects.
// Each character in a SeqOneByteString is a one-byte character.
class SeqOneByteString
    : public TorqueGeneratedSeqOneByteString<SeqOneByteString, SeqString> {
 public:
  static const bool kHasOneByteEncoding = true;
  using Char = uint8_t;

  // Dispatched behavior. The overload without a
  // SharedStringAccessGuardIfNeeded is also defined for convenience; it
  // checks that the access guard is not needed.
  inline uint8_t Get(int index) const;
  inline uint8_t Get(int index, PtrComprCageBase cage_base,
                     const SharedStringAccessGuardIfNeeded& access_guard) const;
  inline void SeqOneByteStringSet(int index, uint16_t value);
  inline void SeqOneByteStringSetChars(int index, const uint8_t* string,
                                       int length);

  // Get the address of the characters in this string.
  inline Address GetCharsAddress() const;

  // Get a pointer to the characters of the string. May only be called when a
  // SharedStringAccessGuard is not needed (i.e. on the main thread or on
  // read-only strings).
  inline uint8_t* GetChars(const DisallowGarbageCollection& no_gc) const;

  // Get a pointer to the characters of the string.
  inline uint8_t* GetChars(
      const DisallowGarbageCollection& no_gc,
      const SharedStringAccessGuardIfNeeded& access_guard) const;

  // Clear uninitialized padding space. This ensures that the snapshot content
  // is deterministic.
  void clear_padding();

  // Maximal memory usage for a single sequential one-byte string.
  static const int kMaxCharsSize = kMaxLength;
  static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize);
  // The aligned maximum object size must still leave room for kMaxLength
  // one-byte characters after the header.
  STATIC_ASSERT((kMaxSize - kHeaderSize) >= String::kMaxLength);

  int AllocatedSize();

  // A SeqOneByteString has different maps depending on whether it is shared.
  static inline bool IsCompatibleMap(Map map, ReadOnlyRoots roots);

  class BodyDescriptor;

  TQ_OBJECT_CONSTRUCTORS(SeqOneByteString)
};

// The SeqTwoByteString class captures sequential unicode string objects.
// Each character in a SeqTwoByteString is a two-byte uint16_t.
class SeqTwoByteString
    : public TorqueGeneratedSeqTwoByteString<SeqTwoByteString, SeqString> {
 public:
  static const bool kHasOneByteEncoding = false;
  using Char = uint16_t;

  // Dispatched behavior.
  inline uint16_t Get(
      int index, PtrComprCageBase cage_base,
      const SharedStringAccessGuardIfNeeded& access_guard) const;
  inline void SeqTwoByteStringSet(int index, uint16_t value);

  // Get the address of the characters in this string.
  inline Address GetCharsAddress() const;

  // Get a pointer to the characters of the string. May only be called when a
  // SharedStringAccessGuard is not needed (i.e. on the main thread or on
  // read-only strings).
  inline base::uc16* GetChars(const DisallowGarbageCollection& no_gc) const;

  // Get a pointer to the characters of the string.
  inline base::uc16* GetChars(
      const DisallowGarbageCollection& no_gc,
      const SharedStringAccessGuardIfNeeded& access_guard) const;

  // Clear uninitialized padding space. This ensures that the snapshot content
  // is deterministic.
  void clear_padding();

  // Maximal memory usage for a single sequential two-byte string.
  static const int kMaxCharsSize = kMaxLength * 2;
  static const int kMaxSize = OBJECT_POINTER_ALIGN(kMaxCharsSize + kHeaderSize);
  // The aligned maximum object size must still leave room for kMaxLength
  // two-byte characters after the header.
  STATIC_ASSERT(static_cast<int>((kMaxSize - kHeaderSize) / sizeof(uint16_t)) >=
                String::kMaxLength);

  int AllocatedSize();

  // A SeqTwoByteString has different maps depending on whether it is shared.
  static inline bool IsCompatibleMap(Map map, ReadOnlyRoots roots);

  class BodyDescriptor;

  TQ_OBJECT_CONSTRUCTORS(SeqTwoByteString)
};

// The ConsString class describes string values built by using the
// addition operator on strings. A ConsString is a pair where the
// first and second components are pointers to other string values.
// One or both components of a ConsString can be pointers to other
// ConsStrings, creating a binary tree of ConsStrings where the leaves
// are non-ConsString string values. The string value represented by
// a ConsString can be obtained by concatenating the leaf string
// values in a left-to-right depth-first traversal of the tree.
class ConsString : public TorqueGeneratedConsString<ConsString, String> {
 public:
  // Doesn't check that the result is a string, even in debug mode. This is
  // useful during GC where the mark bits confuse the checks.
  inline Object unchecked_first() const;

  // Doesn't check that the result is a string, even in debug mode. This is
  // useful during GC where the mark bits confuse the checks.
  inline Object unchecked_second() const;

  V8_INLINE bool IsFlat(PtrComprCageBase cage_base) const;

  // Dispatched behavior.
  V8_EXPORT_PRIVATE uint16_t
  Get(int index, PtrComprCageBase cage_base,
      const SharedStringAccessGuardIfNeeded& access_guard) const;

  // Minimum length for a cons string.
  static const int kMinLength = 13;

  class BodyDescriptor;

  DECL_VERIFIER(ConsString)

  TQ_OBJECT_CONSTRUCTORS(ConsString)
};

// The ThinString class describes string objects that are just references
// to another string object. They are used for in-place internalization when
// the original string cannot actually be internalized in-place: in these
// cases, the original string is converted to a ThinString pointing at its
// internalized version (which is allocated as a new object).
// In terms of memory layout and most algorithms operating on strings,
// ThinStrings can be thought of as "one-part cons strings".
class ThinString : public TorqueGeneratedThinString<ThinString, String> {
 public:
  DECL_GETTER(unchecked_actual, HeapObject)

  // Dispatched behavior.
  V8_EXPORT_PRIVATE uint16_t
  Get(int index, PtrComprCageBase cage_base,
      const SharedStringAccessGuardIfNeeded& access_guard) const;

  DECL_VERIFIER(ThinString)

  class BodyDescriptor;

  TQ_OBJECT_CONSTRUCTORS(ThinString)
};

// The SlicedString class describes strings that are substrings of another
// sequential string. The motivation is to save time and memory when creating
// a substring. A SlicedString is described as a pointer to the parent,
// the offset from the start of the parent string and the length. Using
// a SlicedString therefore requires unpacking of the parent string and
// adding the offset to the start address. Substrings of a SlicedString
// are not nested, since the double indirection is simplified when creating
// such a substring.
// Currently missing features are:
//  - truncating sliced string to enable otherwise unneeded parent to be GC'ed.
class SlicedString : public TorqueGeneratedSlicedString<SlicedString, String> {
 public:
  inline void set_parent(String parent,
                         WriteBarrierMode mode = UPDATE_WRITE_BARRIER);
  // Dispatched behavior.
  V8_EXPORT_PRIVATE uint16_t
  Get(int index, PtrComprCageBase cage_base,
      const SharedStringAccessGuardIfNeeded& access_guard) const;

  // Minimum length for a sliced string.
  static const int kMinLength = 13;

  class BodyDescriptor;

  DECL_VERIFIER(SlicedString)

  TQ_OBJECT_CONSTRUCTORS(SlicedString)
};

// The ExternalString class describes string values that are backed by
// a string resource that lies outside the V8 heap. ExternalStrings
// consist of the length field common to all strings and a pointer to the
// external resource. It is important to ensure (externally) that the
// resource is not deallocated while the ExternalString is live in the
// V8 heap.
//
// The API expects that all ExternalStrings are created through the
// API. Therefore, ExternalStrings should not be used internally.
class ExternalString
    : public TorqueGeneratedExternalString<ExternalString, String> {
 public:
  DECL_VERIFIER(ExternalString)

  // Size of uncached external strings.
  static const int kUncachedSize =
      kResourceOffset + FIELD_SIZE(kResourceOffset);

  inline void AllocateExternalPointerEntries(Isolate* isolate);

  // Return whether the external string data pointer is not cached.
  inline bool is_uncached() const;
  // Size in bytes of the external payload.
  int ExternalPayloadSize() const;

  // Used in the serializer/deserializer.
  DECL_GETTER(resource_as_address, Address)
  inline void set_address_as_resource(Isolate* isolate, Address address);
  inline uint32_t GetResourceRefForDeserialization();
  inline void SetResourceRefForSerialization(uint32_t ref);

  // Disposes string's resource object if it has not already been disposed.
  inline void DisposeResource(Isolate* isolate);

  // The resource offset must agree with the value exposed through Internals.
  STATIC_ASSERT(kResourceOffset == Internals::kStringResourceOffset);
  // Both concrete external string classes statically assert that their kSize
  // equals this value.
  static const int kSizeOfAllExternalStrings = kHeaderSize;

 private:
  // Hide generated accessors.
  DECL_ACCESSORS(resource, void*)
  DECL_ACCESSORS(resource_data, void*)

  TQ_OBJECT_CONSTRUCTORS(ExternalString)
};

// The ExternalOneByteString class is an external string backed by a
// one-byte string.
class ExternalOneByteString
    : public TorqueGeneratedExternalOneByteString<ExternalOneByteString,
                                                  ExternalString> {
 public:
  static const bool kHasOneByteEncoding = true;

  using Resource = v8::String::ExternalOneByteStringResource;

  // The underlying resource.
  DECL_GETTER(resource, const Resource*)

  // It is assumed that the previous resource is null. If it is not null, then
  // it is the responsibility of the caller to handle the previous resource.
  inline void SetResource(Isolate* isolate, const Resource* buffer);

  // Used only during serialization.
  inline void set_resource(Isolate* isolate, const Resource* buffer);

  // Update the pointer cache to the external character array.
  // The cached pointer is always valid, as the external character array does
  // not move during its lifetime. Deserialization is the only exception, after
  // which the pointer cache has to be refreshed.
  inline void update_data_cache(Isolate* isolate);

  inline const uint8_t* GetChars(PtrComprCageBase cage_base) const;

  // Dispatched behavior.
  inline uint8_t Get(int index, PtrComprCageBase cage_base,
                     const SharedStringAccessGuardIfNeeded& access_guard) const;

  class BodyDescriptor;

  STATIC_ASSERT(kSize == kSizeOfAllExternalStrings);

  TQ_OBJECT_CONSTRUCTORS(ExternalOneByteString)

 private:
  // The underlying resource as a non-const pointer.
  DECL_GETTER(mutable_resource, Resource*)
};

// The ExternalTwoByteString class is an external string backed by a UTF-16
// encoded string.
class ExternalTwoByteString
    : public TorqueGeneratedExternalTwoByteString<ExternalTwoByteString,
                                                  ExternalString> {
 public:
  static const bool kHasOneByteEncoding = false;

  using Resource = v8::String::ExternalStringResource;

  // The underlying string resource.
  DECL_GETTER(resource, const Resource*)

  // It is assumed that the previous resource is null. If it is not null, then
  // it is the responsibility of the caller to handle the previous resource.
  inline void SetResource(Isolate* isolate, const Resource* buffer);

  // Used only during serialization.
  inline void set_resource(Isolate* isolate, const Resource* buffer);

  // Update the pointer cache to the external character array.
  // The cached pointer is always valid, as the external character array does
  // not move during its lifetime. Deserialization is the only exception, after
  // which the pointer cache has to be refreshed.
  inline void update_data_cache(Isolate* isolate);

  inline const uint16_t* GetChars(PtrComprCageBase cage_base) const;

  // Dispatched behavior.
  inline uint16_t Get(
      int index, PtrComprCageBase cage_base,
      const SharedStringAccessGuardIfNeeded& access_guard) const;

  // For regexp code.
  inline const uint16_t* ExternalTwoByteStringGetData(unsigned start);

  class BodyDescriptor;

  STATIC_ASSERT(kSize == kSizeOfAllExternalStrings);

  TQ_OBJECT_CONSTRUCTORS(ExternalTwoByteString)

 private:
  // The underlying resource as a non-const pointer.
  DECL_GETTER(mutable_resource, Resource*)
};

// A flat string reader provides random access to the contents of a
// string independent of the character width of the string.
The handle 1029 // must be valid as long as the reader is being used. 1030 // Not safe to use from concurrent background threads. 1031 class V8_EXPORT_PRIVATE FlatStringReader : public Relocatable { 1032 public: 1033 FlatStringReader(Isolate* isolate, Handle<String> str); 1034 void PostGarbageCollection() override; 1035 inline base::uc32 Get(int index) const; 1036 template <typename Char> 1037 inline Char Get(int index) const; 1038 int length() const { return length_; } 1039 1040 private: 1041 Handle<String> str_; 1042 bool is_one_byte_; 1043 int const length_; 1044 const void* start_; 1045 }; 1046 1047 // This maintains an off-stack representation of the stack frames required 1048 // to traverse a ConsString, allowing an entirely iterative and restartable 1049 // traversal of the entire string 1050 class ConsStringIterator { 1051 public: 1052 inline ConsStringIterator() = default; 1053 inline explicit ConsStringIterator(ConsString cons_string, int offset = 0) { 1054 Reset(cons_string, offset); 1055 } 1056 ConsStringIterator(const ConsStringIterator&) = delete; 1057 ConsStringIterator& operator=(const ConsStringIterator&) = delete; 1058 inline void Reset(ConsString cons_string, int offset = 0) { 1059 depth_ = 0; 1060 // Next will always return nullptr. 1061 if (cons_string.is_null()) return; 1062 Initialize(cons_string, offset); 1063 } 1064 // Returns nullptr when complete. 1065 inline String Next(int* offset_out) { 1066 *offset_out = 0; 1067 if (depth_ == 0) return String(); 1068 return Continue(offset_out); 1069 } 1070 1071 private: 1072 static const int kStackSize = 32; 1073 // Use a mask instead of doing modulo operations for stack wrapping. 
1074 static const int kDepthMask = kStackSize - 1; 1075 static_assert(base::bits::IsPowerOfTwo(kStackSize), 1076 "kStackSize must be power of two"); 1077 static inline int OffsetForDepth(int depth); 1078 1079 inline void PushLeft(ConsString string); 1080 inline void PushRight(ConsString string); 1081 inline void AdjustMaximumDepth(); 1082 inline void Pop(); 1083 inline bool StackBlown() { return maximum_depth_ - depth_ == kStackSize; } 1084 V8_EXPORT_PRIVATE void Initialize(ConsString cons_string, int offset); 1085 V8_EXPORT_PRIVATE String Continue(int* offset_out); 1086 String NextLeaf(bool* blew_stack); 1087 String Search(int* offset_out); 1088 1089 // Stack must always contain only frames for which right traversal 1090 // has not yet been performed. 1091 ConsString frames_[kStackSize]; 1092 ConsString root_; 1093 int depth_; 1094 int maximum_depth_; 1095 int consumed_; 1096 }; 1097 1098 class StringCharacterStream; 1099 1100 template <typename Char> 1101 struct CharTraits; 1102 1103 template <> 1104 struct CharTraits<uint8_t> { 1105 using String = SeqOneByteString; 1106 using ExternalString = ExternalOneByteString; 1107 }; 1108 1109 template <> 1110 struct CharTraits<uint16_t> { 1111 using String = SeqTwoByteString; 1112 using ExternalString = ExternalTwoByteString; 1113 }; 1114 1115 } // namespace internal 1116 } // namespace v8 1117 1118 #include "src/objects/object-macros-undef.h" 1119 1120 #endif // V8_OBJECTS_STRING_H_ 1121