• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/parsing/scanner-character-streams.h"
6 
7 #include <memory>
8 #include <vector>
9 
10 #include "include/v8.h"
11 #include "src/common/globals.h"
12 #include "src/handles/handles.h"
13 #include "src/logging/counters.h"
14 #include "src/objects/objects-inl.h"
15 #include "src/parsing/scanner.h"
16 #include "src/strings/unicode-inl.h"
17 
18 namespace v8 {
19 namespace internal {
20 
21 class ScopedExternalStringLock {
22  public:
ScopedExternalStringLock(ExternalString string)23   explicit ScopedExternalStringLock(ExternalString string) {
24     DCHECK(!string.is_null());
25     if (string.IsExternalOneByteString()) {
26       resource_ = ExternalOneByteString::cast(string).resource();
27     } else {
28       DCHECK(string.IsExternalTwoByteString());
29       resource_ = ExternalTwoByteString::cast(string).resource();
30     }
31     DCHECK(resource_);
32     resource_->Lock();
33   }
34 
35   // Copying a lock increases the locking depth.
ScopedExternalStringLock(const ScopedExternalStringLock & other)36   ScopedExternalStringLock(const ScopedExternalStringLock& other) V8_NOEXCEPT
37       : resource_(other.resource_) {
38     resource_->Lock();
39   }
40 
~ScopedExternalStringLock()41   ~ScopedExternalStringLock() { resource_->Unlock(); }
42 
43  private:
44   // Not nullptr.
45   const v8::String::ExternalStringResourceBase* resource_;
46 };
47 
48 namespace {
49 const unibrow::uchar kUtf8Bom = 0xFEFF;
50 }  // namespace
51 
// A half-open range [start, end) of characters.
template <typename Char>
struct Range {
  const Char* start;
  const Char* end;

  // Number of characters between |start| and |end|. Declared const so it can
  // be called on const ranges, just like unaligned_start().
  size_t length() const { return static_cast<size_t>(end - start); }

  // Returns true if the address of |start| is not a multiple of sizeof(Char).
  // The remainder is compared against zero (not against 1) so that character
  // types wider than two bytes are handled as well.
  bool unaligned_start() const {
    return reinterpret_cast<intptr_t>(start) % sizeof(Char) != 0;
  }
};
62 
63 // A Char stream backed by an on-heap SeqOneByteString or SeqTwoByteString.
64 template <typename Char>
65 class OnHeapStream {
66  public:
67   using String = typename CharTraits<Char>::String;
68 
OnHeapStream(Handle<String> string,size_t start_offset,size_t end)69   OnHeapStream(Handle<String> string, size_t start_offset, size_t end)
70       : string_(string), start_offset_(start_offset), length_(end) {}
71 
OnHeapStream(const OnHeapStream &)72   OnHeapStream(const OnHeapStream&) V8_NOEXCEPT : start_offset_(0), length_(0) {
73     UNREACHABLE();
74   }
75 
76   // The no_gc argument is only here because of the templated way this class
77   // is used along with other implementations that require V8 heap access.
GetDataAt(size_t pos,RuntimeCallStats * stats,DisallowHeapAllocation * no_gc)78   Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
79                         DisallowHeapAllocation* no_gc) {
80     return {&string_->GetChars(*no_gc)[start_offset_ + Min(length_, pos)],
81             &string_->GetChars(*no_gc)[start_offset_ + length_]};
82   }
83 
84   static const bool kCanBeCloned = false;
85   static const bool kCanAccessHeap = true;
86 
87  private:
88   Handle<String> string_;
89   const size_t start_offset_;
90   const size_t length_;
91 };
92 
93 // A Char stream backed by an off-heap ExternalOneByteString or
94 // ExternalTwoByteString.
95 template <typename Char>
96 class ExternalStringStream {
97   using ExternalString = typename CharTraits<Char>::ExternalString;
98 
99  public:
ExternalStringStream(ExternalString string,size_t start_offset,size_t length)100   ExternalStringStream(ExternalString string, size_t start_offset,
101                        size_t length)
102       : lock_(string),
103         data_(string.GetChars() + start_offset),
104         length_(length) {}
105 
ExternalStringStream(const ExternalStringStream & other)106   ExternalStringStream(const ExternalStringStream& other) V8_NOEXCEPT
107       : lock_(other.lock_),
108         data_(other.data_),
109         length_(other.length_) {}
110 
111   // The no_gc argument is only here because of the templated way this class
112   // is used along with other implementations that require V8 heap access.
GetDataAt(size_t pos,RuntimeCallStats * stats,DisallowHeapAllocation * no_gc=nullptr)113   Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
114                         DisallowHeapAllocation* no_gc = nullptr) {
115     return {&data_[Min(length_, pos)], &data_[length_]};
116   }
117 
118   static const bool kCanBeCloned = true;
119   static const bool kCanAccessHeap = false;
120 
121  private:
122   ScopedExternalStringLock lock_;
123   const Char* const data_;
124   const size_t length_;
125 };
126 
127 // A Char stream backed by a C array. Testing only.
128 template <typename Char>
129 class TestingStream {
130  public:
TestingStream(const Char * data,size_t length)131   TestingStream(const Char* data, size_t length)
132       : data_(data), length_(length) {}
133   // The no_gc argument is only here because of the templated way this class
134   // is used along with other implementations that require V8 heap access.
GetDataAt(size_t pos,RuntimeCallStats * stats,DisallowHeapAllocation * no_gc=nullptr)135   Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
136                         DisallowHeapAllocation* no_gc = nullptr) {
137     return {&data_[Min(length_, pos)], &data_[length_]};
138   }
139 
140   static const bool kCanBeCloned = true;
141   static const bool kCanAccessHeap = false;
142 
143  private:
144   const Char* const data_;
145   const size_t length_;
146 };
147 
148 // A Char stream backed by multiple source-stream provided off-heap chunks.
149 template <typename Char>
150 class ChunkedStream {
151  public:
ChunkedStream(ScriptCompiler::ExternalSourceStream * source)152   explicit ChunkedStream(ScriptCompiler::ExternalSourceStream* source)
153       : source_(source) {}
154 
ChunkedStream(const ChunkedStream &)155   ChunkedStream(const ChunkedStream&) V8_NOEXCEPT {
156     // TODO(rmcilroy): Implement cloning for chunked streams.
157     UNREACHABLE();
158   }
159 
160   // The no_gc argument is only here because of the templated way this class
161   // is used along with other implementations that require V8 heap access.
GetDataAt(size_t pos,RuntimeCallStats * stats,DisallowHeapAllocation * no_gc=nullptr)162   Range<Char> GetDataAt(size_t pos, RuntimeCallStats* stats,
163                         DisallowHeapAllocation* no_gc = nullptr) {
164     Chunk chunk = FindChunk(pos, stats);
165     size_t buffer_end = chunk.length;
166     size_t buffer_pos = Min(buffer_end, pos - chunk.position);
167     return {&chunk.data[buffer_pos], &chunk.data[buffer_end]};
168   }
169 
~ChunkedStream()170   ~ChunkedStream() {
171     for (Chunk& chunk : chunks_) delete[] chunk.data;
172   }
173 
174   static const bool kCanBeCloned = false;
175   static const bool kCanAccessHeap = false;
176 
177  private:
178   struct Chunk {
Chunkv8::internal::ChunkedStream::Chunk179     Chunk(const Char* const data, size_t position, size_t length)
180         : data(data), position(position), length(length) {}
181     const Char* const data;
182     // The logical position of data.
183     const size_t position;
184     const size_t length;
end_positionv8::internal::ChunkedStream::Chunk185     size_t end_position() const { return position + length; }
186   };
187 
FindChunk(size_t position,RuntimeCallStats * stats)188   Chunk FindChunk(size_t position, RuntimeCallStats* stats) {
189     while (V8_UNLIKELY(chunks_.empty())) FetchChunk(size_t{0}, stats);
190 
191     // Walk forwards while the position is in front of the current chunk.
192     while (position >= chunks_.back().end_position() &&
193            chunks_.back().length > 0) {
194       FetchChunk(chunks_.back().end_position(), stats);
195     }
196 
197     // Walk backwards.
198     for (auto reverse_it = chunks_.rbegin(); reverse_it != chunks_.rend();
199          ++reverse_it) {
200       if (reverse_it->position <= position) return *reverse_it;
201     }
202 
203     UNREACHABLE();
204   }
205 
ProcessChunk(const uint8_t * data,size_t position,size_t length)206   virtual void ProcessChunk(const uint8_t* data, size_t position,
207                             size_t length) {
208     // Incoming data has to be aligned to Char size.
209     DCHECK_EQ(0, length % sizeof(Char));
210     chunks_.emplace_back(reinterpret_cast<const Char*>(data), position,
211                          length / sizeof(Char));
212   }
213 
FetchChunk(size_t position,RuntimeCallStats * stats)214   void FetchChunk(size_t position, RuntimeCallStats* stats) {
215     const uint8_t* data = nullptr;
216     size_t length;
217     {
218       RuntimeCallTimerScope scope(stats,
219                                   RuntimeCallCounterId::kGetMoreDataCallback);
220       length = source_->GetMoreData(&data);
221     }
222     ProcessChunk(data, position, length);
223   }
224 
225   ScriptCompiler::ExternalSourceStream* source_;
226 
227  protected:
228   std::vector<struct Chunk> chunks_;
229 };
230 
231 // Provides a buffered utf-16 view on the bytes from the underlying ByteStream.
232 // Chars are buffered if either the underlying stream isn't utf-16 or the
233 // underlying utf-16 stream might move (is on-heap).
234 template <template <typename T> class ByteStream>
235 class BufferedCharacterStream : public Utf16CharacterStream {
236  public:
237   template <class... TArgs>
BufferedCharacterStream(size_t pos,TArgs...args)238   BufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {
239     buffer_pos_ = pos;
240   }
241 
can_be_cloned() const242   bool can_be_cloned() const final {
243     return ByteStream<uint16_t>::kCanBeCloned;
244   }
245 
Clone() const246   std::unique_ptr<Utf16CharacterStream> Clone() const override {
247     CHECK(can_be_cloned());
248     return std::unique_ptr<Utf16CharacterStream>(
249         new BufferedCharacterStream<ByteStream>(*this));
250   }
251 
252  protected:
ReadBlock()253   bool ReadBlock() final {
254     size_t position = pos();
255     buffer_pos_ = position;
256     buffer_start_ = &buffer_[0];
257     buffer_cursor_ = buffer_start_;
258 
259     DisallowHeapAllocation no_gc;
260     Range<uint8_t> range =
261         byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
262     if (range.length() == 0) {
263       buffer_end_ = buffer_start_;
264       return false;
265     }
266 
267     size_t length = Min(kBufferSize, range.length());
268     i::CopyChars(buffer_, range.start, length);
269     buffer_end_ = &buffer_[length];
270     return true;
271   }
272 
can_access_heap() const273   bool can_access_heap() const final {
274     return ByteStream<uint8_t>::kCanAccessHeap;
275   }
276 
277  private:
BufferedCharacterStream(const BufferedCharacterStream<ByteStream> & other)278   BufferedCharacterStream(const BufferedCharacterStream<ByteStream>& other)
279       : byte_stream_(other.byte_stream_) {}
280 
281   static const size_t kBufferSize = 512;
282   uc16 buffer_[kBufferSize];
283   ByteStream<uint8_t> byte_stream_;
284 };
285 
286 // Provides a unbuffered utf-16 view on the bytes from the underlying
287 // ByteStream.
288 template <template <typename T> class ByteStream>
289 class UnbufferedCharacterStream : public Utf16CharacterStream {
290  public:
291   template <class... TArgs>
UnbufferedCharacterStream(size_t pos,TArgs...args)292   UnbufferedCharacterStream(size_t pos, TArgs... args) : byte_stream_(args...) {
293     buffer_pos_ = pos;
294   }
295 
can_access_heap() const296   bool can_access_heap() const final {
297     return ByteStream<uint16_t>::kCanAccessHeap;
298   }
299 
can_be_cloned() const300   bool can_be_cloned() const final {
301     return ByteStream<uint16_t>::kCanBeCloned;
302   }
303 
Clone() const304   std::unique_ptr<Utf16CharacterStream> Clone() const override {
305     return std::unique_ptr<Utf16CharacterStream>(
306         new UnbufferedCharacterStream<ByteStream>(*this));
307   }
308 
309  protected:
ReadBlock()310   bool ReadBlock() final {
311     size_t position = pos();
312     buffer_pos_ = position;
313     DisallowHeapAllocation no_gc;
314     Range<uint16_t> range =
315         byte_stream_.GetDataAt(position, runtime_call_stats(), &no_gc);
316     buffer_start_ = range.start;
317     buffer_end_ = range.end;
318     buffer_cursor_ = buffer_start_;
319     if (range.length() == 0) return false;
320 
321     DCHECK(!range.unaligned_start());
322     DCHECK_LE(buffer_start_, buffer_end_);
323     return true;
324   }
325 
UnbufferedCharacterStream(const UnbufferedCharacterStream<ByteStream> & other)326   UnbufferedCharacterStream(const UnbufferedCharacterStream<ByteStream>& other)
327       : byte_stream_(other.byte_stream_) {}
328 
329   ByteStream<uint16_t> byte_stream_;
330 };
331 
332 // Provides a unbuffered utf-16 view on the bytes from the underlying
333 // ByteStream.
334 class RelocatingCharacterStream final
335     : public UnbufferedCharacterStream<OnHeapStream> {
336  public:
337   template <class... TArgs>
RelocatingCharacterStream(Isolate * isolate,size_t pos,TArgs...args)338   RelocatingCharacterStream(Isolate* isolate, size_t pos, TArgs... args)
339       : UnbufferedCharacterStream<OnHeapStream>(pos, args...),
340         isolate_(isolate) {
341     isolate->heap()->AddGCEpilogueCallback(UpdateBufferPointersCallback,
342                                            v8::kGCTypeAll, this);
343   }
344 
345  private:
~RelocatingCharacterStream()346   ~RelocatingCharacterStream() final {
347     isolate_->heap()->RemoveGCEpilogueCallback(UpdateBufferPointersCallback,
348                                                this);
349   }
350 
UpdateBufferPointersCallback(v8::Isolate * v8_isolate,v8::GCType type,v8::GCCallbackFlags flags,void * stream)351   static void UpdateBufferPointersCallback(v8::Isolate* v8_isolate,
352                                            v8::GCType type,
353                                            v8::GCCallbackFlags flags,
354                                            void* stream) {
355     reinterpret_cast<RelocatingCharacterStream*>(stream)
356         ->UpdateBufferPointers();
357   }
358 
UpdateBufferPointers()359   void UpdateBufferPointers() {
360     DisallowHeapAllocation no_gc;
361     Range<uint16_t> range =
362         byte_stream_.GetDataAt(buffer_pos_, runtime_call_stats(), &no_gc);
363     if (range.start != buffer_start_) {
364       buffer_cursor_ = (buffer_cursor_ - buffer_start_) + range.start;
365       buffer_start_ = range.start;
366       buffer_end_ = range.end;
367     }
368   }
369 
370   Isolate* isolate_;
371 };
372 
373 // ----------------------------------------------------------------------------
374 // BufferedUtf16CharacterStreams
375 //
376 // A buffered character stream based on a random access character
377 // source (ReadBlock can be called with pos() pointing to any position,
378 // even positions before the current).
379 //
380 // TODO(verwaest): Remove together with Utf8 external streaming streams.
381 class BufferedUtf16CharacterStream : public Utf16CharacterStream {
382  public:
383   BufferedUtf16CharacterStream();
384 
385  protected:
386   static const size_t kBufferSize = 512;
387 
388   bool ReadBlock() final;
389 
390   // FillBuffer should read up to kBufferSize characters at position and store
391   // them into buffer_[0..]. It returns the number of characters stored.
392   virtual size_t FillBuffer(size_t position) = 0;
393 
394   // Fixed sized buffer that this class reads from.
395   // The base class' buffer_start_ should always point to buffer_.
396   uc16 buffer_[kBufferSize];
397 };
398 
BufferedUtf16CharacterStream()399 BufferedUtf16CharacterStream::BufferedUtf16CharacterStream()
400     : Utf16CharacterStream(buffer_, buffer_, buffer_, 0) {}
401 
ReadBlock()402 bool BufferedUtf16CharacterStream::ReadBlock() {
403   DCHECK_EQ(buffer_start_, buffer_);
404 
405   size_t position = pos();
406   buffer_pos_ = position;
407   buffer_cursor_ = buffer_;
408   buffer_end_ = buffer_ + FillBuffer(position);
409   DCHECK_EQ(pos(), position);
410   DCHECK_LE(buffer_end_, buffer_start_ + kBufferSize);
411   return buffer_cursor_ < buffer_end_;
412 }
413 
414 // ----------------------------------------------------------------------------
415 // Utf8ExternalStreamingStream - chunked streaming of Utf-8 data.
416 //
417 // This implementation is fairly complex, since data arrives in chunks which
418 // may 'cut' arbitrarily into utf-8 characters. Also, seeking to a given
419 // character position is tricky because the byte position cannot be derived
420 // from the character position.
421 //
422 // TODO(verwaest): Decode utf8 chunks into utf16 chunks on the blink side
423 // instead so we don't need to buffer.
424 
425 class Utf8ExternalStreamingStream final : public BufferedUtf16CharacterStream {
426  public:
Utf8ExternalStreamingStream(ScriptCompiler::ExternalSourceStream * source_stream)427   Utf8ExternalStreamingStream(
428       ScriptCompiler::ExternalSourceStream* source_stream)
429       : current_({0, {0, 0, 0, unibrow::Utf8::State::kAccept}}),
430         source_stream_(source_stream) {}
~Utf8ExternalStreamingStream()431   ~Utf8ExternalStreamingStream() final {
432     for (const Chunk& chunk : chunks_) delete[] chunk.data;
433   }
434 
can_access_heap() const435   bool can_access_heap() const final { return false; }
436 
can_be_cloned() const437   bool can_be_cloned() const final { return false; }
438 
Clone() const439   std::unique_ptr<Utf16CharacterStream> Clone() const override {
440     UNREACHABLE();
441   }
442 
443  protected:
444   size_t FillBuffer(size_t position) final;
445 
446  private:
447   // A position within the data stream. It stores:
448   // - The 'physical' position (# of bytes in the stream),
449   // - the 'logical' position (# of ucs-2 characters, also within the stream),
450   // - a possibly incomplete utf-8 char at the current 'physical' position.
451   struct StreamPosition {
452     size_t bytes;
453     size_t chars;
454     uint32_t incomplete_char;
455     unibrow::Utf8::State state;
456   };
457 
458   // Position contains a StreamPosition and the index of the chunk the position
459   // points into. (The chunk_no could be derived from pos, but that'd be
460   // an expensive search through all chunks.)
461   struct Position {
462     size_t chunk_no;
463     StreamPosition pos;
464   };
465 
466   // A chunk in the list of chunks, containing:
467   // - The chunk data (data pointer and length), and
468   // - the position at the first byte of the chunk.
469   struct Chunk {
470     const uint8_t* data;
471     size_t length;
472     StreamPosition start;
473   };
474 
475   // Within the current chunk, skip forward from current_ towards position.
476   bool SkipToPosition(size_t position);
477   // Within the current chunk, fill the buffer_ (while it has capacity).
478   void FillBufferFromCurrentChunk();
479   // Fetch a new chunk (assuming current_ is at the end of the current data).
480   bool FetchChunk();
481   // Search through the chunks and set current_ to point to the given position.
482   // (This call is potentially expensive.)
483   void SearchPosition(size_t position);
484 
485   std::vector<Chunk> chunks_;
486   Position current_;
487   ScriptCompiler::ExternalSourceStream* source_stream_;
488 };
489 
SkipToPosition(size_t position)490 bool Utf8ExternalStreamingStream::SkipToPosition(size_t position) {
491   DCHECK_LE(current_.pos.chars, position);  // We can only skip forward.
492 
493   // Already there? Then return immediately.
494   if (current_.pos.chars == position) return true;
495 
496   const Chunk& chunk = chunks_[current_.chunk_no];
497   DCHECK(current_.pos.bytes >= chunk.start.bytes);
498 
499   unibrow::Utf8::State state = chunk.start.state;
500   uint32_t incomplete_char = chunk.start.incomplete_char;
501   size_t it = current_.pos.bytes - chunk.start.bytes;
502   const uint8_t* cursor = &chunk.data[it];
503   const uint8_t* end = &chunk.data[chunk.length];
504 
505   size_t chars = current_.pos.chars;
506 
507   if (V8_UNLIKELY(current_.pos.bytes < 3 && chars == 0)) {
508     while (cursor < end) {
509       unibrow::uchar t =
510           unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
511       if (t == unibrow::Utf8::kIncomplete) continue;
512       if (t != kUtf8Bom) {
513         chars++;
514         if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
515       }
516       break;
517     }
518   }
519 
520   while (cursor < end && chars < position) {
521     unibrow::uchar t =
522         unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
523     if (t != unibrow::Utf8::kIncomplete) {
524       chars++;
525       if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) chars++;
526     }
527   }
528 
529   current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
530   current_.pos.chars = chars;
531   current_.pos.incomplete_char = incomplete_char;
532   current_.pos.state = state;
533   current_.chunk_no += (cursor == end);
534 
535   return current_.pos.chars == position;
536 }
537 
FillBufferFromCurrentChunk()538 void Utf8ExternalStreamingStream::FillBufferFromCurrentChunk() {
539   DCHECK_LT(current_.chunk_no, chunks_.size());
540   DCHECK_EQ(buffer_start_, buffer_cursor_);
541   DCHECK_LT(buffer_end_ + 1, buffer_start_ + kBufferSize);
542 
543   const Chunk& chunk = chunks_[current_.chunk_no];
544 
545   // The buffer_ is writable, but buffer_*_ members are const. So we get a
546   // non-const pointer into buffer that points to the same char as buffer_end_.
547   uint16_t* output_cursor = buffer_ + (buffer_end_ - buffer_start_);
548   DCHECK_EQ(output_cursor, buffer_end_);
549 
550   unibrow::Utf8::State state = current_.pos.state;
551   uint32_t incomplete_char = current_.pos.incomplete_char;
552 
553   // If the current chunk is the last (empty) chunk we'll have to process
554   // any left-over, partial characters.
555   if (chunk.length == 0) {
556     unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
557     if (t != unibrow::Utf8::kBufferEmpty) {
558       DCHECK_EQ(t, unibrow::Utf8::kBadChar);
559       *output_cursor = static_cast<uc16>(t);
560       buffer_end_++;
561       current_.pos.chars++;
562       current_.pos.incomplete_char = 0;
563       current_.pos.state = state;
564     }
565     return;
566   }
567 
568   size_t it = current_.pos.bytes - chunk.start.bytes;
569   const uint8_t* cursor = chunk.data + it;
570   const uint8_t* end = chunk.data + chunk.length;
571 
572   // Deal with possible BOM.
573   if (V8_UNLIKELY(current_.pos.bytes < 3 && current_.pos.chars == 0)) {
574     while (cursor < end) {
575       unibrow::uchar t =
576           unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
577       if (V8_LIKELY(t < kUtf8Bom)) {
578         *(output_cursor++) = static_cast<uc16>(t);  // The most frequent case.
579       } else if (t == unibrow::Utf8::kIncomplete) {
580         continue;
581       } else if (t == kUtf8Bom) {
582         // BOM detected at beginning of the stream. Don't copy it.
583       } else if (t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
584         *(output_cursor++) = static_cast<uc16>(t);
585       } else {
586         *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
587         *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
588       }
589       break;
590     }
591   }
592 
593   const uint16_t* max_buffer_end = buffer_start_ + kBufferSize;
594   while (cursor < end && output_cursor + 1 < max_buffer_end) {
595     unibrow::uchar t =
596         unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
597     if (V8_LIKELY(t <= unibrow::Utf16::kMaxNonSurrogateCharCode)) {
598       *(output_cursor++) = static_cast<uc16>(t);  // The most frequent case.
599     } else if (t == unibrow::Utf8::kIncomplete) {
600       continue;
601     } else {
602       *(output_cursor++) = unibrow::Utf16::LeadSurrogate(t);
603       *(output_cursor++) = unibrow::Utf16::TrailSurrogate(t);
604     }
605     // Fast path for ascii sequences.
606     size_t remaining = end - cursor;
607     size_t max_buffer = max_buffer_end - output_cursor;
608     int max_length = static_cast<int>(Min(remaining, max_buffer));
609     DCHECK_EQ(state, unibrow::Utf8::State::kAccept);
610     int ascii_length = NonAsciiStart(cursor, max_length);
611     CopyChars(output_cursor, cursor, ascii_length);
612     cursor += ascii_length;
613     output_cursor += ascii_length;
614   }
615 
616   current_.pos.bytes = chunk.start.bytes + (cursor - chunk.data);
617   current_.pos.chars += (output_cursor - buffer_end_);
618   current_.pos.incomplete_char = incomplete_char;
619   current_.pos.state = state;
620   current_.chunk_no += (cursor == end);
621 
622   buffer_end_ = output_cursor;
623 }
624 
FetchChunk()625 bool Utf8ExternalStreamingStream::FetchChunk() {
626   RuntimeCallTimerScope scope(runtime_call_stats(),
627                               RuntimeCallCounterId::kGetMoreDataCallback);
628   DCHECK_EQ(current_.chunk_no, chunks_.size());
629   DCHECK(chunks_.empty() || chunks_.back().length != 0);
630 
631   const uint8_t* chunk = nullptr;
632   size_t length = source_stream_->GetMoreData(&chunk);
633   chunks_.push_back({chunk, length, current_.pos});
634   return length > 0;
635 }
636 
SearchPosition(size_t position)637 void Utf8ExternalStreamingStream::SearchPosition(size_t position) {
638   // If current_ already points to the right position, we're done.
639   //
640   // This is expected to be the common case, since we typically call
641   // FillBuffer right after the current buffer.
642   if (current_.pos.chars == position) return;
643 
644   // No chunks. Fetch at least one, so we can assume !chunks_.empty() below.
645   if (chunks_.empty()) {
646     DCHECK_EQ(current_.chunk_no, 0u);
647     DCHECK_EQ(current_.pos.bytes, 0u);
648     DCHECK_EQ(current_.pos.chars, 0u);
649     FetchChunk();
650   }
651 
652   // Search for the last chunk whose start position is less or equal to
653   // position.
654   size_t chunk_no = chunks_.size() - 1;
655   while (chunk_no > 0 && chunks_[chunk_no].start.chars > position) {
656     chunk_no--;
657   }
658 
659   // Did we find the terminating (zero-length) chunk? Then we're seeking
660   // behind the end of the data, and position does not exist.
661   // Set current_ to point to the terminating chunk.
662   if (chunks_[chunk_no].length == 0) {
663     current_ = {chunk_no, chunks_[chunk_no].start};
664     return;
665   }
666 
667   // Did we find the non-last chunk? Then our position must be within chunk_no.
668   if (chunk_no + 1 < chunks_.size()) {
669     // Fancy-pants optimization for ASCII chunks within a utf-8 stream.
670     // (Many web sites declare utf-8 encoding, but use only (or almost only) the
671     //  ASCII subset for their JavaScript sources. We can exploit this, by
672     //  checking whether the # bytes in a chunk are equal to the # chars, and if
673     //  so avoid the expensive SkipToPosition.)
674     bool ascii_only_chunk =
675         chunks_[chunk_no].start.incomplete_char == 0 &&
676         (chunks_[chunk_no + 1].start.bytes - chunks_[chunk_no].start.bytes) ==
677             (chunks_[chunk_no + 1].start.chars - chunks_[chunk_no].start.chars);
678     if (ascii_only_chunk) {
679       size_t skip = position - chunks_[chunk_no].start.chars;
680       current_ = {chunk_no,
681                   {chunks_[chunk_no].start.bytes + skip,
682                    chunks_[chunk_no].start.chars + skip, 0,
683                    unibrow::Utf8::State::kAccept}};
684     } else {
685       current_ = {chunk_no, chunks_[chunk_no].start};
686       SkipToPosition(position);
687     }
688 
689     // Since position was within the chunk, SkipToPosition should have found
690     // something.
691     DCHECK_EQ(position, current_.pos.chars);
692     return;
693   }
694 
695   // What's left: We're in the last, non-terminating chunk. Our position
696   // may be in the chunk, but it may also be in 'future' chunks, which we'll
697   // have to obtain.
698   DCHECK_EQ(chunk_no, chunks_.size() - 1);
699   current_ = {chunk_no, chunks_[chunk_no].start};
700   bool have_more_data = true;
701   bool found = SkipToPosition(position);
702   while (have_more_data && !found) {
703     DCHECK_EQ(current_.chunk_no, chunks_.size());
704     have_more_data = FetchChunk();
705     found = have_more_data && SkipToPosition(position);
706   }
707 
708   // We'll return with a postion != the desired position only if we're out
709   // of data. In that case, we'll point to the terminating chunk.
710   DCHECK_EQ(found, current_.pos.chars == position);
711   DCHECK_EQ(have_more_data, chunks_.back().length != 0);
712   DCHECK_IMPLIES(!found, !have_more_data);
713   DCHECK_IMPLIES(!found, current_.chunk_no == chunks_.size() - 1);
714 }
715 
FillBuffer(size_t position)716 size_t Utf8ExternalStreamingStream::FillBuffer(size_t position) {
717   buffer_cursor_ = buffer_;
718   buffer_end_ = buffer_;
719 
720   SearchPosition(position);
721   bool out_of_data = current_.chunk_no != chunks_.size() &&
722                      chunks_[current_.chunk_no].length == 0 &&
723                      current_.pos.incomplete_char == 0;
724 
725   if (out_of_data) return 0;
726 
727   // Fill the buffer, until we have at least one char (or are out of data).
728   // (The embedder might give us 1-byte blocks within a utf-8 char, so we
729   //  can't guarantee progress with one chunk. Thus we iterate.)
730   while (!out_of_data && buffer_cursor_ == buffer_end_) {
731     // At end of current data, but there might be more? Then fetch it.
732     if (current_.chunk_no == chunks_.size()) {
733       out_of_data = !FetchChunk();
734     }
735     FillBufferFromCurrentChunk();
736   }
737 
738   DCHECK_EQ(current_.pos.chars - position,
739             static_cast<size_t>(buffer_end_ - buffer_cursor_));
740   return buffer_end_ - buffer_cursor_;
741 }
742 
743 // ----------------------------------------------------------------------------
744 // ScannerStream: Create stream instances.
745 
For(Isolate * isolate,Handle<String> data)746 Utf16CharacterStream* ScannerStream::For(Isolate* isolate,
747                                          Handle<String> data) {
748   return ScannerStream::For(isolate, data, 0, data->length());
749 }
750 
For(Isolate * isolate,Handle<String> data,int start_pos,int end_pos)751 Utf16CharacterStream* ScannerStream::For(Isolate* isolate, Handle<String> data,
752                                          int start_pos, int end_pos) {
753   DCHECK_GE(start_pos, 0);
754   DCHECK_LE(start_pos, end_pos);
755   DCHECK_LE(end_pos, data->length());
756   size_t start_offset = 0;
757   if (data->IsSlicedString()) {
758     SlicedString string = SlicedString::cast(*data);
759     start_offset = string.offset();
760     String parent = string.parent();
761     if (parent.IsThinString()) parent = ThinString::cast(parent).actual();
762     data = handle(parent, isolate);
763   } else {
764     data = String::Flatten(isolate, data);
765   }
766   if (data->IsExternalOneByteString()) {
767     return new BufferedCharacterStream<ExternalStringStream>(
768         static_cast<size_t>(start_pos), ExternalOneByteString::cast(*data),
769         start_offset, static_cast<size_t>(end_pos));
770   } else if (data->IsExternalTwoByteString()) {
771     return new UnbufferedCharacterStream<ExternalStringStream>(
772         static_cast<size_t>(start_pos), ExternalTwoByteString::cast(*data),
773         start_offset, static_cast<size_t>(end_pos));
774   } else if (data->IsSeqOneByteString()) {
775     return new BufferedCharacterStream<OnHeapStream>(
776         static_cast<size_t>(start_pos), Handle<SeqOneByteString>::cast(data),
777         start_offset, static_cast<size_t>(end_pos));
778   } else if (data->IsSeqTwoByteString()) {
779     return new RelocatingCharacterStream(
780         isolate, static_cast<size_t>(start_pos),
781         Handle<SeqTwoByteString>::cast(data), start_offset,
782         static_cast<size_t>(end_pos));
783   } else {
784     UNREACHABLE();
785   }
786 }
787 
ForTesting(const char * data)788 std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
789     const char* data) {
790   return ScannerStream::ForTesting(data, strlen(data));
791 }
792 
ForTesting(const char * data,size_t length)793 std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
794     const char* data, size_t length) {
795   if (data == nullptr) {
796     DCHECK_EQ(length, 0);
797 
798     // We don't want to pass in a null pointer into the the character stream,
799     // because then the one-past-the-end pointer is undefined, so instead pass
800     // through this static array.
801     static const char non_null_empty_string[1] = {0};
802     data = non_null_empty_string;
803   }
804 
805   return std::unique_ptr<Utf16CharacterStream>(
806       new BufferedCharacterStream<TestingStream>(
807           0, reinterpret_cast<const uint8_t*>(data), length));
808 }
809 
ForTesting(const uint16_t * data,size_t length)810 std::unique_ptr<Utf16CharacterStream> ScannerStream::ForTesting(
811     const uint16_t* data, size_t length) {
812   if (data == nullptr) {
813     DCHECK_EQ(length, 0);
814 
815     // We don't want to pass in a null pointer into the the character stream,
816     // because then the one-past-the-end pointer is undefined, so instead pass
817     // through this static array.
818     static const uint16_t non_null_empty_uint16_t_string[1] = {0};
819     data = non_null_empty_uint16_t_string;
820   }
821 
822   return std::unique_ptr<Utf16CharacterStream>(
823       new UnbufferedCharacterStream<TestingStream>(0, data, length));
824 }
825 
For(ScriptCompiler::ExternalSourceStream * source_stream,v8::ScriptCompiler::StreamedSource::Encoding encoding)826 Utf16CharacterStream* ScannerStream::For(
827     ScriptCompiler::ExternalSourceStream* source_stream,
828     v8::ScriptCompiler::StreamedSource::Encoding encoding) {
829   switch (encoding) {
830     case v8::ScriptCompiler::StreamedSource::TWO_BYTE:
831       return new UnbufferedCharacterStream<ChunkedStream>(
832           static_cast<size_t>(0), source_stream);
833     case v8::ScriptCompiler::StreamedSource::ONE_BYTE:
834       return new BufferedCharacterStream<ChunkedStream>(static_cast<size_t>(0),
835                                                         source_stream);
836     case v8::ScriptCompiler::StreamedSource::UTF8:
837       return new Utf8ExternalStreamingStream(source_stream);
838   }
839   UNREACHABLE();
840 }
841 
842 }  // namespace internal
843 }  // namespace v8
844