1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/parsing/scanner-character-streams.h"
6
7 #include <memory>
8 #include <vector>
9
10 #include "include/v8-callbacks.h"
11 #include "include/v8-primitive.h"
12 #include "src/base/strings.h"
13 #include "src/common/globals.h"
14 #include "src/execution/isolate-utils.h"
15 #include "src/handles/handles.h"
16 #include "src/logging/runtime-call-stats-scope.h"
17 #include "src/objects/objects-inl.h"
18 #include "src/parsing/scanner.h"
19 #include "src/strings/unicode-inl.h"
20
21 namespace v8 {
22 namespace internal {
23
24 class V8_NODISCARD ScopedExternalStringLock {
25 public:
ScopedExternalStringLock(ExternalString string)26 explicit ScopedExternalStringLock(ExternalString string) {
27 DCHECK(!string.is_null());
28 if (string.IsExternalOneByteString()) {
29 resource_ = ExternalOneByteString::cast(string).resource();
30 } else {
31 DCHECK(string.IsExternalTwoByteString());
32 resource_ = ExternalTwoByteString::cast(string).resource();
33 }
34 DCHECK(resource_);
35 resource_->Lock();
36 }
37
38 // Copying a lock increases the locking depth.
ScopedExternalStringLock(const ScopedExternalStringLock & other)39 ScopedExternalStringLock(const ScopedExternalStringLock& other) V8_NOEXCEPT
40 : resource_(other.resource_) {
41 resource_->Lock();
42 }
43
~ScopedExternalStringLock()44 ~ScopedExternalStringLock() { resource_->Unlock(); }
45
46 private:
47 // Not nullptr.
48 const v8::String::ExternalStringResourceBase* resource_;
49 };
50
51 namespace {
52 const unibrow::uchar kUtf8Bom = 0xFEFF;
53 } // namespace
54
// A half-open span [start, end) of characters of type Char. The struct does
// not own the memory it points at.
template <typename Char>
struct Range {
  const Char* start;
  const Char* end;

  // Number of characters between |start| and |end|.
  size_t length() const { return static_cast<size_t>(end - start); }

  // True if |start| does not sit on a multiple of sizeof(Char). Any non-zero
  // remainder counts as unaligned, so this also works for character types
  // wider than two bytes (for two-byte characters the only possible non-zero
  // remainder is 1).
  bool unaligned_start() const {
    return reinterpret_cast<uintptr_t>(start) % sizeof(Char) != 0;
  }
};
65
66 // A Char stream backed by an on-heap SeqOneByteString or SeqTwoByteString.
67 template <typename Char>
68 class OnHeapStream {
69 public:
70 using String = typename CharTraits<Char>::String;
71
OnHeapStream(Handle<String> string,size_t start_offset,size_t end)72 OnHeapStream(Handle<String> string, size_t start_offset, size_t end)
73 : string_(string), start_offset_(start_offset), length_(end) {}
74
OnHeapStream(const OnHeapStream &)75 OnHeapStream(const OnHeapStream&) V8_NOEXCEPT : start_offset_(0), length_(0) {
76 UNREACHABLE();
77 }
78
79 // The no_gc argument is only here because of the templated way this class
80 // is used along with other implementations that require V8 heap access.
GetDataAt(size_t pos,RuntimeCallStats * stats,DisallowGarbageCollection * no_gc)81 Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
82 DisallowGarbageCollection* no_gc) {
83 return {&string_->GetChars(*no_gc)[start_offset_ + std::min(length_, pos)],
84 &string_->GetChars(*no_gc)[start_offset_ + length_]};
85 }
86
87 static const bool kCanBeCloned = false;
88 static const bool kCanAccessHeap = true;
89
90 private:
91 Handle<String> string_;
92 const size_t start_offset_;
93 const size_t length_;
94 };
95
96 // A Char stream backed by an off-heap ExternalOneByteString or
97 // ExternalTwoByteString.
98 template <typename Char>
99 class ExternalStringStream {
100 using ExternalString = typename CharTraits<Char>::ExternalString;
101
102 public:
ExternalStringStream(ExternalString string,size_t start_offset,size_t length)103 ExternalStringStream(ExternalString string, size_t start_offset,
104 size_t length)
105 : lock_(string),
106 data_(string.GetChars(GetPtrComprCageBase(string)) + start_offset),
107 length_(length) {}
108
ExternalStringStream(const ExternalStringStream & other)109 ExternalStringStream(const ExternalStringStream& other) V8_NOEXCEPT
110 : lock_(other.lock_),
111 data_(other.data_),
112 length_(other.length_) {}
113
114 // The no_gc argument is only here because of the templated way this class
115 // is used along with other implementations that require V8 heap access.
GetDataAt(size_t pos,RuntimeCallStats * stats,DisallowGarbageCollection * no_gc=nullptr)116 Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
117 DisallowGarbageCollection* no_gc = nullptr) {
118 return {&data_[std::min(length_, pos)], &data_[length_]};
119 }
120
121 static const bool kCanBeCloned = true;
122 static const bool kCanAccessHeap = false;
123
124 private:
125 ScopedExternalStringLock lock_;
126 const Char* const data_;
127 const size_t length_;
128 };
129
130 // A Char stream backed by a C array. Testing only.
131 template <typename Char>
132 class TestingStream {
133 public:
TestingStream(const Char * data,size_t length)134 TestingStream(const Char* data, size_t length)
135 : data_(data), length_(length) {}
136 // The no_gc argument is only here because of the templated way this class
137 // is used along with other implementations that require V8 heap access.
GetDataAt(size_t pos,RuntimeCallStats * stats,DisallowGarbageCollection * no_gc=nullptr)138 Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
139 DisallowGarbageCollection* no_gc = nullptr) {
140 return {&data_[std::min(length_, pos)], &data_[length_]};
141 }
142
143 static const bool kCanBeCloned = true;
144 static const bool kCanAccessHeap = false;
145
146 private:
147 const Char* const data_;
148 const size_t length_;
149 };
150
151 // A Char stream backed by multiple source-stream provided off-heap chunks.
152 template <typename Char>
153 class ChunkedStream {
154 public:
ChunkedStream(ScriptCompiler::ExternalSourceStream * source)155 explicit ChunkedStream(ScriptCompiler::ExternalSourceStream* source)
156 : source_(source), chunks_(std::make_shared<std::vector<Chunk>>()) {}
157
ChunkedStream(const ChunkedStream & other)158 ChunkedStream(const ChunkedStream& other) V8_NOEXCEPT
159 : source_(nullptr),
160 chunks_(other.chunks_) {}
161
162 // The no_gc argument is only here because of the templated way this class
163 // is used along with other implementations that require V8 heap access.
GetDataAt(size_t pos,RuntimeCallStats * stats,DisallowGarbageCollection * no_gc=nullptr)164 Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
165 DisallowGarbageCollection* no_gc = nullptr) {
166 Chunk& chunk = FindChunk(pos, stats);
167 size_t buffer_end = chunk.length;
168 size_t buffer_pos = std::min(buffer_end, pos - chunk.position);
169 return {&chunk.data.get()[buffer_pos], &chunk.data.get()[buffer_end]};
170 }
171
172 static const bool kCanBeCloned = true;
173 static const bool kCanAccessHeap = false;
174
175 private:
176 struct Chunk {
Chunkv8::internal::ChunkedStream::Chunk177 Chunk(const Char* const data, size_t position, size_t length)
178 : data(data), position(position), length(length) {}
179 std::unique_ptr<const Char[]> data;
180 // The logical position of data.
181 const size_t position;
182 const size_t length;
end_positionv8::internal::ChunkedStream::Chunk183 size_t end_position() const { return position + length; }
184 };
185
FindChunk(size_t position,RuntimeCallStats * stats)186 Chunk& FindChunk(size_t position, RuntimeCallStats* stats) {
187 while (V8_UNLIKELY(chunks_->empty())) FetchChunk(size_t{0}, stats);
188
189 // Walk forwards while the position is in front of the current chunk.
190 while (position >= chunks_->back().end_position() &&
191 chunks_->back().length > 0) {
192 FetchChunk(chunks_->back().end_position(), stats);
193 }
194
195 // Walk backwards.
196 for (auto reverse_it = chunks_->rbegin(); reverse_it != chunks_->rend();
197 ++reverse_it) {
198 if (reverse_it->position <= position) return *reverse_it;
199 }
200
201 UNREACHABLE();
202 }
203
ProcessChunk(const uint8_t * data,size_t position,size_t length)204 virtual void ProcessChunk(const uint8_t* data, size_t position,
205 size_t length) {
206 // Incoming data has to be aligned to Char size.
207 DCHECK_EQ(0, length % sizeof(Char));
208 chunks_->emplace_back(reinterpret_cast<const Char*>(data), position,
209 length / sizeof(Char));
210 }
211
FetchChunk(size_t position,RuntimeCallStats * stats)212 void FetchChunk(size_t position, RuntimeCallStats* stats) {
213 // Cloned ChunkedStreams have a null source, and therefore can't fetch any
214 // new data.
215 DCHECK_NOT_NULL(source_);
216
217 const uint8_t* data = nullptr;
218 size_t length;
219 {
220 RCS_SCOPE(stats, RuntimeCallCounterId::kGetMoreDataCallback);
221 length = source_->GetMoreData(&data);
222 }
223 ProcessChunk(data, position, length);
224 }
225
226 ScriptCompiler::ExternalSourceStream* source_;
227
228 protected:
229 std::shared_ptr<std::vector<struct Chunk>> chunks_;
230 };
231
232 // Provides a buffered utf-16 view on the bytes from the underlying ByteStream.
233 // Chars are buffered if either the underlying stream isn't utf-16 or the
234 // underlying utf-16 stream might move (is on-heap).
235 template <template <typename T> class ByteStream>
236 class BufferedCharacterStream : public Utf16CharacterStream {
237 public:
238 template <class... TArgs>
BufferedCharacterStream(size_t pos,TArgs...args)239 BufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {
240 buffer_pos_ = pos;
241 }
242
can_be_cloned() const243 bool can_be_cloned() const final {
244 return ByteStream<uint16_t>::kCanBeCloned;
245 }
246
Clone() const247 std::unique_ptr<Utf16CharacterStream> Clone() const override {
248 CHECK(can_be_cloned());
249 return std::unique_ptr<Utf16CharacterStream>(
250 new BufferedCharacterStream<ByteStream>(*this));
251 }
252
253 protected:
ReadBlock(size_t position)254 bool ReadBlock(size_t position) final {
255 buffer_pos_ = position;
256 buffer_start_ = &buffer_[0];
257 buffer_cursor_ = buffer_start_;
258
259 DisallowGarbageCollection no_gc;
260 Range<uint8_t> range =
261 byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
262 if (range.length() == 0) {
263 buffer_end_ = buffer_start_;
264 return false;
265 }
266
267 size_t length = std::min({kBufferSize, range.length()});
268 i::CopyChars(buffer_, range.start, length);
269 buffer_end_ = &buffer_[length];
270 return true;
271 }
272
can_access_heap() const273 bool can_access_heap() const final {
274 return ByteStream<uint8_t>::kCanAccessHeap;
275 }
276
277 private:
BufferedCharacterStream(const BufferedCharacterStream<ByteStream> & other)278 BufferedCharacterStream(const BufferedCharacterStream<ByteStream>& other)
279 : byte_stream_(other.byte_stream_) {}
280
281 static const size_t kBufferSize = 512;
282 base::uc16 buffer_[kBufferSize];
283 ByteStream<uint8_t> byte_stream_;
284 };
285
286 // Provides a unbuffered utf-16 view on the bytes from the underlying
287 // ByteStream.
288 template <template <typename T> class ByteStream>
289 class UnbufferedCharacterStream : public Utf16CharacterStream {
290 public:
291 template <class... TArgs>
UnbufferedCharacterStream(size_t pos,TArgs...args)292 UnbufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {
293 buffer_pos_ = pos;
294 }
295
can_access_heap() const296 bool can_access_heap() const final {
297 return ByteStream<uint16_t>::kCanAccessHeap;
298 }
299
can_be_cloned() const300 bool can_be_cloned() const final {
301 return ByteStream<uint16_t>::kCanBeCloned;
302 }
303
Clone() const304 std::unique_ptr<Utf16CharacterStream> Clone() const override {
305 return std::unique_ptr<Utf16CharacterStream>(
306 new UnbufferedCharacterStream<ByteStream>(*this));
307 }
308
309 protected:
ReadBlock(size_t position)310 bool ReadBlock(size_t position) final {
311 buffer_pos_ = position;
312 DisallowGarbageCollection no_gc;
313 Range<uint16_t> range =
314 byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
315 buffer_start_ = range.start;
316 buffer_end_ = range.end;
317 buffer_cursor_ = buffer_start_;
318 if (range.length() == 0) return false;
319
320 DCHECK(!range.unaligned_start());
321 DCHECK_LE(buffer_start_, buffer_end_);
322 return true;
323 }
324
UnbufferedCharacterStream(const UnbufferedCharacterStream<ByteStream> & other)325 UnbufferedCharacterStream(const UnbufferedCharacterStream<ByteStream>& other)
326 : byte_stream_(other.byte_stream_) {}
327
328 ByteStream<uint16_t> byte_stream_;
329 };
330
331 // Provides a unbuffered utf-16 view on the bytes from the underlying
332 // ByteStream.
333 class RelocatingCharacterStream final
334 : public UnbufferedCharacterStream<OnHeapStream> {
335 public:
336 template <class... TArgs>
RelocatingCharacterStream(Isolate * isolate,size_t pos,TArgs...args)337 RelocatingCharacterStream(Isolate* isolate, size_t pos, TArgs... args)
338 : UnbufferedCharacterStream<OnHeapStream>(pos, args...),
339 isolate_(isolate) {
340 isolate->main_thread_local_heap()->AddGCEpilogueCallback(
341 UpdateBufferPointersCallback, this);
342 }
343
344 private:
~RelocatingCharacterStream()345 ~RelocatingCharacterStream() final {
346 isolate_->main_thread_local_heap()->RemoveGCEpilogueCallback(
347 UpdateBufferPointersCallback, this);
348 }
349
UpdateBufferPointersCallback(void * stream)350 static void UpdateBufferPointersCallback(void* stream) {
351 reinterpret_cast<RelocatingCharacterStream*>(stream)
352 ->UpdateBufferPointers();
353 }
354
UpdateBufferPointers()355 void UpdateBufferPointers() {
356 DisallowGarbageCollection no_gc;
357 Range<uint16_t> range =
358 byte_stream_.GetDataAt(buffer_pos_, runtime_call_stats(), &no_gc);
359 if (range.start != buffer_start_) {
360 buffer_cursor_ = (buffer_cursor_ - buffer_start_) + range.start;
361 buffer_start_ = range.start;
362 buffer_end_ = range.end;
363 }
364 }
365
366 Isolate* isolate_;
367 };
368
369 // ----------------------------------------------------------------------------
370 // BufferedUtf16CharacterStreams
371 //
372 // A buffered character stream based on a random access character
373 // source (ReadBlock can be called with pos() pointing to any position,
374 // even positions before the current).
375 //
376 // TODO(verwaest): Remove together with Utf8 external streaming streams.
377 class BufferedUtf16CharacterStream : public Utf16CharacterStream {
378 public:
379 BufferedUtf16CharacterStream();
380
381 protected:
382 static const size_t kBufferSize = 512;
383
384 bool ReadBlock(size_t position) final;
385
386 // FillBuffer should read up to kBufferSize characters at position and store
387 // them into buffer_[0..]. It returns the number of characters stored.
388 virtual size_t FillBuffer(size_t position) = 0;
389
390 // Fixed sized buffer that this class reads from.
391 // The base class' buffer_start_ should always point to buffer_.
392 base::uc16 buffer_[kBufferSize];
393 };
394
BufferedUtf16CharacterStream()395 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
396 : Utf16CharacterStream(buffer_, buffer_, buffer_, 0) {}
397
ReadBlock(size_t position)398 bool BufferedUtf16CharacterStream::ReadBlock(size_t position) {
399 DCHECK_EQ(buffer_start_, buffer_);
400
401 buffer_pos_ = position;
402 buffer_cursor_ = buffer_;
403 buffer_end_ = buffer_ + FillBuffer(position);
404 DCHECK_EQ(pos(), position);
405 DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize);
406 return buffer_cursor_ < buffer_end_;
407 }
408
409 // ----------------------------------------------------------------------------
410 // Windows1252CharacterStream - chunked streaming of windows-1252 data.
411 //
// Similar to BufferedCharacterStream, but also translates the windows-1252
// characters that are incompatible with their latin-1 equivalents.
414
415 namespace {
416
417 static const base::uc16 kWindows1252ToUC16[256] = {
418 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, // 00-07
419 0x0008, 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x000E, 0x000F, // 08-0F
420 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, // 10-17
421 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F, // 18-1F
422 0x0020, 0x0021, 0x0022, 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, // 20-27
423 0x0028, 0x0029, 0x002A, 0x002B, 0x002C, 0x002D, 0x002E, 0x002F, // 28-2F
424 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, // 30-37
425 0x0038, 0x0039, 0x003A, 0x003B, 0x003C, 0x003D, 0x003E, 0x003F, // 38-3F
426 0x0040, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047, // 40-47
427 0x0048, 0x0049, 0x004A, 0x004B, 0x004C, 0x004D, 0x004E, 0x004F, // 48-4F
428 0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057, // 50-57
429 0x0058, 0x0059, 0x005A, 0x005B, 0x005C, 0x005D, 0x005E, 0x005F, // 58-5F
430 0x0060, 0x0061, 0x0062, 0x0063, 0x0064, 0x0065, 0x0066, 0x0067, // 60-67
431 0x0068, 0x0069, 0x006A, 0x006B, 0x006C, 0x006D, 0x006E, 0x006F, // 68-6F
432 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, 0x0075, 0x0076, 0x0077, // 70-77
433 0x0078, 0x0079, 0x007A, 0x007B, 0x007C, 0x007D, 0x007E, 0x007F, // 78-7F
434 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
435 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
436 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
437 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
438 0x00A0, 0x00A1, 0x00A2, 0x00A3, 0x00A4, 0x00A5, 0x00A6, 0x00A7, // A0-A7
439 0x00A8, 0x00A9, 0x00AA, 0x00AB, 0x00AC, 0x00AD, 0x00AE, 0x00AF, // A8-AF
440 0x00B0, 0x00B1, 0x00B2, 0x00B3, 0x00B4, 0x00B5, 0x00B6, 0x00B7, // B0-B7
441 0x00B8, 0x00B9, 0x00BA, 0x00BB, 0x00BC, 0x00BD, 0x00BE, 0x00BF, // B8-BF
442 0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x00C6, 0x00C7, // C0-C7
443 0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x00CC, 0x00CD, 0x00CE, 0x00CF, // C8-CF
444 0x00D0, 0x00D1, 0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D7, // D0-D7
445 0x00D8, 0x00D9, 0x00DA, 0x00DB, 0x00DC, 0x00DD, 0x00DE, 0x00DF, // D8-DF
446 0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x00E6, 0x00E7, // E0-E7
447 0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x00EC, 0x00ED, 0x00EE, 0x00EF, // E8-EF
448 0x00F0, 0x00F1, 0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F7, // F0-F7
449 0x00F8, 0x00F9, 0x00FA, 0x00FB, 0x00FC, 0x00FD, 0x00FE, 0x00FF // F8-FF
450 };
451
452 } // namespace
453
454 class Windows1252CharacterStream final : public Utf16CharacterStream {
455 public:
Windows1252CharacterStream(size_t pos,ScriptCompiler::ExternalSourceStream * source_stream)456 Windows1252CharacterStream(
457 size_t pos, ScriptCompiler::ExternalSourceStream* source_stream)
458 : byte_stream_(source_stream) {
459 buffer_pos_ = pos;
460 }
461
can_be_cloned() const462 bool can_be_cloned() const final {
463 return ChunkedStream<uint16_t>::kCanBeCloned;
464 }
465
Clone() const466 std::unique_ptr<Utf16CharacterStream> Clone() const override {
467 CHECK(can_be_cloned());
468 return std::unique_ptr<Utf16CharacterStream>(
469 new Windows1252CharacterStream(*this));
470 }
471
472 protected:
ReadBlock(size_t position)473 bool ReadBlock(size_t position) final {
474 buffer_pos_ = position;
475 buffer_start_ = &buffer_[0];
476 buffer_cursor_ = buffer_start_;
477
478 DisallowGarbageCollection no_gc;
479 Range<uint8_t> range =
480 byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
481 if (range.length() == 0) {
482 buffer_end_ = buffer_start_;
483 return false;
484 }
485
486 size_t length = std::min({kBufferSize, range.length()});
487 std::transform(range.start, range.start + length, &buffer_[0],
488 [](uint8_t c) { return kWindows1252ToUC16[c]; });
489 buffer_end_ = &buffer_[length];
490 return true;
491 }
492
can_access_heap() const493 bool can_access_heap() const final {
494 return ChunkedStream<uint8_t>::kCanAccessHeap;
495 }
496
497 private:
Windows1252CharacterStream(const Windows1252CharacterStream & other)498 Windows1252CharacterStream(const Windows1252CharacterStream& other)
499 V8_NOEXCEPT : byte_stream_(other.byte_stream_) {}
500
501 static const size_t kBufferSize = 512;
502 base::uc16 buffer_[kBufferSize];
503 ChunkedStream<uint8_t> byte_stream_;
504 };
505
506 // ----------------------------------------------------------------------------
507 // Utf8ExternalStreamingStream - chunked streaming of Utf-8 data.
508 //
509 // This implementation is fairly complex, since data arrives in chunks which
510 // may 'cut' arbitrarily into utf-8 characters. Also, seeking to a given
511 // character position is tricky because the byte position cannot be derived
512 // from the character position.
513 //
514 // TODO(verwaest): Decode utf8 chunks into utf16 chunks on the blink side
515 // instead so we don't need to buffer.
516
517 class Utf8ExternalStreamingStream final : public BufferedUtf16CharacterStream {
518 public:
Utf8ExternalStreamingStream(ScriptCompiler::ExternalSourceStream * source_stream)519 Utf8ExternalStreamingStream(
520 ScriptCompiler::ExternalSourceStream* source_stream)
521 : chunks_(std::make_shared<std::vector<Chunk>>()),
522 current_({0, {0, 0, 0, unibrow::Utf8::State::kAccept}}),
523 source_stream_(source_stream) {}
524 ~Utf8ExternalStreamingStream() final = default;
525
can_access_heap() const526 bool can_access_heap() const final { return false; }
527
can_be_cloned() const528 bool can_be_cloned() const final { return true; }
529
Clone() const530 std::unique_ptr<Utf16CharacterStream> Clone() const override {
531 return std::unique_ptr<Utf16CharacterStream>(
532 new Utf8ExternalStreamingStream(*this));
533 }
534
535 protected:
536 size_t FillBuffer(size_t position) final;
537
538 private:
539 // A position within the data stream. It stores:
540 // - The 'physical' position (# of bytes in the stream),
541 // - the 'logical' position (# of ucs-2 characters, also within the stream),
542 // - a possibly incomplete utf-8 char at the current 'physical' position.
543 struct StreamPosition {
544 size_t bytes;
545 size_t chars;
546 uint32_t incomplete_char;
547 unibrow::Utf8::State state;
548 };
549
550 // Position contains a StreamPosition and the index of the chunk the position
551 // points into. (The chunk_no could be derived from pos, but that'd be
552 // an expensive search through all chunks.)
553 struct Position {
554 size_t chunk_no;
555 StreamPosition pos;
556 };
557
558 // A chunk in the list of chunks, containing:
559 // - The chunk data (data pointer and length), and
560 // - the position at the first byte of the chunk.
561 struct Chunk {
Chunkv8::internal::Utf8ExternalStreamingStream::Chunk562 Chunk(const uint8_t* data, size_t length, StreamPosition start)
563 : data(data), length(length), start(start) {}
564 std::unique_ptr<const uint8_t[]> data;
565 size_t length;
566 StreamPosition start;
567 };
568
Utf8ExternalStreamingStream(const Utf8ExternalStreamingStream & source_stream)569 Utf8ExternalStreamingStream(const Utf8ExternalStreamingStream& source_stream)
570 V8_NOEXCEPT : chunks_(source_stream.chunks_),
571 current_({0, {0, 0, 0, unibrow::Utf8::State::kAccept}}),
572 source_stream_(nullptr) {}
573
574 // Within the current chunk, skip forward from current_ towards position.
575 bool SkipToPosition(size_t position);
576 // Within the current chunk, fill the buffer_ (while it has capacity).
577 void FillBufferFromCurrentChunk();
578 // Fetch a new chunk (assuming current_ is at the end of the current data).
579 bool FetchChunk();
580 // Search through the chunks and set current_ to point to the given position.
581 // (This call is potentially expensive.)
582 void SearchPosition(size_t position);
583
GetChunk(size_t chunk_no)584 Chunk& GetChunk(size_t chunk_no) { return (*chunks_)[chunk_no]; }
585
586 std::shared_ptr<std::vector<Chunk>> chunks_;
587 Position current_;
588 ScriptCompiler::ExternalSourceStream* source_stream_;
589 };
590
SkipToPosition(size_t position)591 bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
592 DCHECK_LE(current_.pos.chars, position); // We can only skip forward.
593
594 // Already there? Then return immediately.
595 if (current_.pos.chars == position) return true;
596
597 const Chunk& chunk = GetChunk(current_.chunk_no);
598 DCHECK(current_.pos.bytes >= chunk.start.bytes);
599
600 unibrow::Utf8::State state = chunk.start.state;
601 uint32_t incomplete_char = chunk.start.incomplete_char;
602 size_t it = current_.pos.bytes - chunk.start.bytes;
603 const uint8_t* cursor = &chunk.data.get()[it];
604 const uint8_t* end = &chunk.data.get()[chunk.length];
605
606 size_t chars = current_.pos.chars;
607
608 if (V8_UNLIKELY(current_.pos.bytes < 3 && chars == 0)) {
609 while (cursor < end) {
610 unibrow::uchar t =
611 unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
612 if (t == unibrow::Utf8::kIncomplete) continue;
613 if (t != kUtf8Bom) {
614 chars++;
615 if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
616 }
617 break;
618 }
619 }
620
621 while (cursor < end && chars < position) {
622 unibrow::uchar t =
623 unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
624 if (t != unibrow::Utf8::kIncomplete) {
625 chars++;
626 if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
627 }
628 }
629
630 current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data.get());
631 current_.pos.chars = chars;
632 current_.pos.incomplete_char = incomplete_char;
633 current_.pos.state = state;
634 current_.chunk_no += (cursor == end);
635
636 return current_.pos.chars == position;
637 }
638
FillBufferFromCurrentChunk()639 void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
640 DCHECK_LT(current_.chunk_no, chunks_->size());
641 DCHECK_EQ(buffer_start_, buffer_cursor_);
642 DCHECK_LT(buffer_end_ + 1, buffer_start_ + kBufferSize);
643
644 const Chunk& chunk = GetChunk(current_.chunk_no);
645
646 // The buffer_ is writable, but buffer_*_ members are const. So we get a
647 // non-const pointer into buffer that points to the same char as buffer_end_.
648 uint16_t* output_cursor = buffer_ + (buffer_end_ - buffer_start_);
649 DCHECK_EQ(output_cursor, buffer_end_);
650
651 unibrow::Utf8::State state = current_.pos.state;
652 uint32_t incomplete_char = current_.pos.incomplete_char;
653
654 // If the current chunk is the last (empty) chunk we'll have to process
655 // any left-over, partial characters.
656 if (chunk.length == 0) {
657 unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
658 if (t != unibrow::Utf8::kBufferEmpty) {
659 DCHECK_EQ(t, unibrow::Utf8::kBadChar);
660 *output_cursor = static_cast<base::uc16>(t);
661 buffer_end_++;
662 current_.pos.chars++;
663 current_.pos.incomplete_char = 0;
664 current_.pos.state = state;
665 }
666 return;
667 }
668
669 size_t it = current_.pos.bytes - chunk.start.bytes;
670 const uint8_t* cursor = chunk.data.get() + it;
671 const uint8_t* end = chunk.data.get() + chunk.length;
672
673 // Deal with possible BOM.
674 if (V8_UNLIKELY(current_.pos.bytes < 3 && current_.pos.chars == 0)) {
675 while (cursor < end) {
676 unibrow::uchar t =
677 unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
678 if (V8_LIKELY(t < kUtf8Bom)) {
679 *(output_cursor++) =
680 static_cast<base::uc16>(t); // The most frequent case.
681 } else if (t == unibrow::Utf8::kIncomplete) {
682 continue;
683 } else if (t == kUtf8Bom) {
684 // BOM detected at beginning of the stream. Don't copy it.
685 } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
686 *(output_cursor++) = static_cast<base::uc16>(t);
687 } else {
688 *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
689 *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
690 }
691 break;
692 }
693 }
694
695 const uint16_t* max_buffer_end = buffer_start_ + kBufferSize;
696 while (cursor < end && output_cursor + 1 < max_buffer_end) {
697 unibrow::uchar t =
698 unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
699 if (V8_LIKELY(t <= unibrow::Utf16::kMaxNonSurrogateCharCode)) {
700 *(output_cursor++) =
701 static_cast<base::uc16>(t); // The most frequent case.
702 } else if (t == unibrow::Utf8::kIncomplete) {
703 continue;
704 } else {
705 *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
706 *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
707 }
708 // Fast path for ascii sequences.
709 size_t remaining = end - cursor;
710 size_t max_buffer = max_buffer_end - output_cursor;
711 int max_length = static_cast<int>(std::min(remaining, max_buffer));
712 DCHECK_EQ(state, unibrow::Utf8::State::kAccept);
713 int ascii_length = NonAsciiStart(cursor, max_length);
714 CopyChars(output_cursor, cursor, ascii_length);
715 cursor += ascii_length;
716 output_cursor += ascii_length;
717 }
718
719 current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data.get());
720 current_.pos.chars += (output_cursor - buffer_end_);
721 current_.pos.incomplete_char = incomplete_char;
722 current_.pos.state = state;
723 current_.chunk_no += (cursor == end);
724
725 buffer_end_ = output_cursor;
726 }
727
FetchChunk()728 bool Utf8ExternalStreamingStream::FetchChunk() {
729 RCS_SCOPE(runtime_call_stats(), RuntimeCallCounterId::kGetMoreDataCallback);
730 DCHECK_EQ(current_.chunk_no, chunks_->size());
731 DCHECK(chunks_->empty() || chunks_->back().length != 0);
732
733 // Clone Utf8ExternalStreamingStreams have a null source stream, and
734 // therefore can't fetch any new data.
735 DCHECK_NOT_NULL(source_stream_);
736
737 // Utf8ExternalStreamingStreams that have been cloned are not allowed to fetch
738 // any more.
739 DCHECK_EQ(chunks_.use_count(), 1);
740
741 const uint8_t* chunk = nullptr;
742 size_t length = source_stream_->GetMoreData(&chunk);
743 chunks_->emplace_back(chunk, length, current_.pos);
744 return length > 0;
745 }
746
SearchPosition(size_t position)747 void Utf8ExternalStreamingStream::SearchPosition(size_t position) {
748 // If current_ already points to the right position, we're done.
749 //
750 // This is expected to be the common case, since we typically call
751 // FillBuffer right after the current buffer.
752 if (current_.pos.chars == position) return;
753
754 // No chunks. Fetch at least one, so we can assume !chunks_->empty() below.
755 if (chunks_->empty()) {
756 DCHECK_EQ(current_.chunk_no, 0u);
757 DCHECK_EQ(current_.pos.bytes, 0u);
758 DCHECK_EQ(current_.pos.chars, 0u);
759 FetchChunk();
760 }
761
762 // Search for the last chunk whose start position is less or equal to
763 // position.
764 size_t chunk_no = chunks_->size() - 1;
765 while (chunk_no > 0 && GetChunk(chunk_no).start.chars > position) {
766 chunk_no--;
767 }
768
769 // Did we find the terminating (zero-length) chunk? Then we're seeking
770 // behind the end of the data, and position does not exist.
771 // Set current_ to point to the terminating chunk.
772 if (GetChunk(chunk_no).length == 0) {
773 current_ = {chunk_no, GetChunk(chunk_no).start};
774 return;
775 }
776
777 // Did we find the non-last chunk? Then our position must be within chunk_no.
778 if (chunk_no + 1 < chunks_->size()) {
779 // Fancy-pants optimization for ASCII chunks within a utf-8 stream.
780 // (Many web sites declare utf-8 encoding, but use only (or almost only) the
781 // ASCII subset for their JavaScript sources. We can exploit this, by
782 // checking whether the # bytes in a chunk are equal to the # chars, and if
783 // so avoid the expensive SkipToPosition.)
784 bool ascii_only_chunk =
785 GetChunk(chunk_no).start.incomplete_char == 0 &&
786 (GetChunk(chunk_no + 1).start.bytes - GetChunk(chunk_no).start.bytes) ==
787 (GetChunk(chunk_no + 1).start.chars -
788 GetChunk(chunk_no).start.chars);
789 if (ascii_only_chunk) {
790 size_t skip = position - GetChunk(chunk_no).start.chars;
791 current_ = {chunk_no,
792 {GetChunk(chunk_no).start.bytes + skip,
793 GetChunk(chunk_no).start.chars + skip, 0,
794 unibrow::Utf8::State::kAccept}};
795 } else {
796 current_ = {chunk_no, GetChunk(chunk_no).start};
797 SkipToPosition(position);
798 }
799
800 // Since position was within the chunk, SkipToPosition should have found
801 // something.
802 DCHECK_EQ(position, current_.pos.chars);
803 return;
804 }
805
806 // What's left: We're in the last, non-terminating chunk. Our position
807 // may be in the chunk, but it may also be in 'future' chunks, which we'll
808 // have to obtain.
809 DCHECK_EQ(chunk_no, chunks_->size() - 1);
810 current_ = {chunk_no, GetChunk(chunk_no).start};
811 bool have_more_data = true;
812 bool found = SkipToPosition(position);
813 while (have_more_data && !found) {
814 DCHECK_EQ(current_.chunk_no, chunks_->size());
815 have_more_data = FetchChunk();
816 found = have_more_data && SkipToPosition(position);
817 }
818
819 // We'll return with a position != the desired position only if we're out
820 // of data. In that case, we'll point to the terminating chunk.
821 DCHECK_EQ(found, current_.pos.chars == position);
822 DCHECK_EQ(have_more_data, chunks_->back().length != 0);
823 DCHECK_IMPLIES(!found, !have_more_data);
824 DCHECK_IMPLIES(!found, current_.chunk_no == chunks_->size() - 1);
825 }
826
FillBuffer(size_t position)827 size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
828 buffer_cursor_ = buffer_;
829 buffer_end_ = buffer_;
830
831 SearchPosition(position);
832 bool out_of_data = current_.chunk_no != chunks_->size() &&
833 GetChunk(current_.chunk_no).length == 0 &&
834 current_.pos.incomplete_char == 0;
835
836 if (out_of_data) return 0;
837
838 // Fill the buffer, until we have at least one char (or are out of data).
839 // (The embedder might give us 1-byte blocks within a utf-8 char, so we
840 // can't guarantee progress with one chunk. Thus we iterate.)
841 while (!out_of_data && buffer_cursor_ == buffer_end_) {
842 // At end of current data, but there might be more? Then fetch it.
843 if (current_.chunk_no == chunks_->size()) {
844 out_of_data = !FetchChunk();
845 }
846 FillBufferFromCurrentChunk();
847 }
848
849 DCHECK_EQ(current_.pos.chars - position,
850 static_cast<size_t>(buffer_end_ - buffer_cursor_));
851 return buffer_end_ - buffer_cursor_;
852 }
853
854 // ----------------------------------------------------------------------------
855 // ScannerStream: Create stream instances.
856
For(Isolate * isolate,Handle<String> data)857 Utf16CharacterStream* ScannerStream::For(Isolate* isolate,
858 Handle<String> data) {
859 return ScannerStream::For(isolate, data, 0, data->length());
860 }
861
For(Isolate * isolate,Handle<String> data,int start_pos,int end_pos)862 Utf16CharacterStream* ScannerStream::For(Isolate* isolate, Handle<String> data,
863 int start_pos, int end_pos) {
864 DCHECK_GE(start_pos, 0);
865 DCHECK_LE(start_pos, end_pos);
866 DCHECK_LE(end_pos, data->length());
867 size_t start_offset = 0;
868 if (data->IsSlicedString()) {
869 SlicedString string = SlicedString::cast(*data);
870 start_offset = string.offset();
871 String parent = string.parent();
872 if (parent.IsThinString()) parent = ThinString::cast(parent).actual();
873 data = handle(parent, isolate);
874 } else {
875 data = String::Flatten(isolate, data);
876 }
877 if (data->IsExternalOneByteString()) {
878 return new BufferedCharacterStream<ExternalStringStream>(
879 static_cast<size_t>(start_pos), ExternalOneByteString::cast(*data),
880 start_offset, static_cast<size_t>(end_pos));
881 } else if (data->IsExternalTwoByteString()) {
882 return new UnbufferedCharacterStream<ExternalStringStream>(
883 static_cast<size_t>(start_pos), ExternalTwoByteString::cast(*data),
884 start_offset, static_cast<size_t>(end_pos));
885 } else if (data->IsSeqOneByteString()) {
886 return new BufferedCharacterStream<OnHeapStream>(
887 static_cast<size_t>(start_pos), Handle<SeqOneByteString>::cast(data),
888 start_offset, static_cast<size_t>(end_pos));
889 } else if (data->IsSeqTwoByteString()) {
890 return new RelocatingCharacterStream(
891 isolate, static_cast<size_t>(start_pos),
892 Handle<SeqTwoByteString>::cast(data), start_offset,
893 static_cast<size_t>(end_pos));
894 } else {
895 UNREACHABLE();
896 }
897 }
898
ForTesting(const char * data)899 std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
900 const char* data) {
901 return ScannerStream::ForTesting(data, strlen(data));
902 }
903
ForTesting(const char * data,size_t length)904 std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
905 const char* data, size_t length) {
906 if (data == nullptr) {
907 DCHECK_EQ(length, 0);
908
909 // We don't want to pass in a null pointer into the the character stream,
910 // because then the one-past-the-end pointer is undefined, so instead pass
911 // through this static array.
912 static const char non_null_empty_string[1] = {0};
913 data = non_null_empty_string;
914 }
915
916 return std::unique_ptr<Utf16CharacterStream>(
917 new BufferedCharacterStream<TestingStream>(
918 0, reinterpret_cast<const uint8_t*>(data), length));
919 }
920
ForTesting(const uint16_t * data,size_t length)921 std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
922 const uint16_t* data, size_t length) {
923 if (data == nullptr) {
924 DCHECK_EQ(length, 0);
925
926 // We don't want to pass in a null pointer into the the character stream,
927 // because then the one-past-the-end pointer is undefined, so instead pass
928 // through this static array.
929 static const uint16_t non_null_empty_uint16_t_string[1] = {0};
930 data = non_null_empty_uint16_t_string;
931 }
932
933 return std::unique_ptr<Utf16CharacterStream>(
934 new UnbufferedCharacterStream<TestingStream>(0, data, length));
935 }
936
For(ScriptCompiler::ExternalSourceStream * source_stream,v8::ScriptCompiler::StreamedSource::Encoding encoding)937 Utf16CharacterStream* ScannerStream::For(
938 ScriptCompiler::ExternalSourceStream* source_stream,
939 v8::ScriptCompiler::StreamedSource::Encoding encoding) {
940 switch (encoding) {
941 case v8::ScriptCompiler::StreamedSource::TWO_BYTE:
942 return new UnbufferedCharacterStream<ChunkedStream>(
943 static_cast<size_t>(0), source_stream);
944 case v8::ScriptCompiler::StreamedSource::ONE_BYTE:
945 return new BufferedCharacterStream<ChunkedStream>(static_cast<size_t>(0),
946 source_stream);
947 case v8::ScriptCompiler::StreamedSource::WINDOWS_1252:
948 return new Windows1252CharacterStream(static_cast<size_t>(0),
949 source_stream);
950 case v8::ScriptCompiler::StreamedSource::UTF8:
951 return new Utf8ExternalStreamingStream(source_stream);
952 }
953 UNREACHABLE();
954 }
955
956 } // namespace internal
957 } // namespace v8
958