1 // Copyright 2014 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/strings/unicode-decoder.h"
6
7 #include "src/strings/unicode-inl.h"
8 #include "src/utils/memcopy.h"
9
10 namespace v8 {
11 namespace internal {
12
Utf8Decoder(const base::Vector<const uint8_t> & chars)13 Utf8Decoder::Utf8Decoder(const base::Vector<const uint8_t>& chars)
14 : encoding_(Encoding::kAscii),
15 non_ascii_start_(NonAsciiStart(chars.begin(), chars.length())),
16 utf16_length_(non_ascii_start_) {
17 if (non_ascii_start_ == chars.length()) return;
18
19 const uint8_t* cursor = chars.begin() + non_ascii_start_;
20 const uint8_t* end = chars.begin() + chars.length();
21
22 bool is_one_byte = true;
23 uint32_t incomplete_char = 0;
24 unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
25
26 while (cursor < end) {
27 unibrow::uchar t =
28 unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
29 if (t != unibrow::Utf8::kIncomplete) {
30 is_one_byte = is_one_byte && t <= unibrow::Latin1::kMaxChar;
31 utf16_length_++;
32 if (t > unibrow::Utf16::kMaxNonSurrogateCharCode) utf16_length_++;
33 }
34 }
35
36 unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
37 if (t != unibrow::Utf8::kBufferEmpty) {
38 is_one_byte = false;
39 utf16_length_++;
40 }
41
42 encoding_ = is_one_byte ? Encoding::kLatin1 : Encoding::kUtf16;
43 }
44
45 template <typename Char>
Decode(Char * out,const base::Vector<const uint8_t> & data)46 void Utf8Decoder::Decode(Char* out, const base::Vector<const uint8_t>& data) {
47 CopyChars(out, data.begin(), non_ascii_start_);
48
49 out += non_ascii_start_;
50
51 uint32_t incomplete_char = 0;
52 unibrow::Utf8::State state = unibrow::Utf8::State::kAccept;
53
54 const uint8_t* cursor = data.begin() + non_ascii_start_;
55 const uint8_t* end = data.begin() + data.length();
56
57 while (cursor < end) {
58 unibrow::uchar t =
59 unibrow::Utf8::ValueOfIncremental(&cursor, &state, &incomplete_char);
60 if (t != unibrow::Utf8::kIncomplete) {
61 if (sizeof(Char) == 1 || t <= unibrow::Utf16::kMaxNonSurrogateCharCode) {
62 *(out++) = static_cast<Char>(t);
63 } else {
64 *(out++) = unibrow::Utf16::LeadSurrogate(t);
65 *(out++) = unibrow::Utf16::TrailSurrogate(t);
66 }
67 }
68 }
69
70 unibrow::uchar t = unibrow::Utf8::ValueOfIncrementalFinish(&state);
71 if (t != unibrow::Utf8::kBufferEmpty) *out = static_cast<Char>(t);
72 }
73
74 template V8_EXPORT_PRIVATE void Utf8Decoder::Decode(
75 uint8_t* out, const base::Vector<const uint8_t>& data);
76
77 template V8_EXPORT_PRIVATE void Utf8Decoder::Decode(
78 uint16_t* out, const base::Vector<const uint8_t>& data);
79
80 } // namespace internal
81 } // namespace v8
82