1
2 /*
3 * Copyright (C) 2024 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 #include "src/trace_processor/importers/proto/string_encoding_utils.h"
19
20 #include <cinttypes>
21 #include <cstddef>
22 #include <cstdint>
23 #include <optional>
24 #include <string>
25
26 #include "perfetto/base/logging.h"
27
28 namespace perfetto {
29 namespace trace_processor {
30 namespace {
31
32 using CodePoint = uint32_t;
33
34 struct Utf8 {
35 static constexpr uint8_t kContinuationHeader = 0x80;
36 static constexpr uint8_t kContinuationValueMask = 0x3F;
37 static constexpr uint8_t kContinuationBits = 6;
38
39 static constexpr uint8_t k2ByteHeader = 0xC0;
40 static constexpr uint8_t k2ByteValueMask = 0x1F;
41
42 static constexpr uint8_t k3ByteHeader = 0xE0;
43 static constexpr uint8_t k3ByteValueMask = 0x0F;
44
45 static constexpr uint8_t k4ByteHeader = 0xF0;
46 static constexpr uint8_t k4ByteValueMask = 0x07;
47
48 static constexpr CodePoint k1ByteMaxCodepoint = 0x7F;
49 static constexpr CodePoint k2ByteMaxCodepoint = 0x7FF;
50 static constexpr CodePoint k3ByteMaxCodepoint = 0xFFFF;
51 static constexpr CodePoint k4ByteMaxCodepoint = 0x10FFFF;
52
Appendperfetto::trace_processor::__anon34fd468b0111::Utf853 static void Append(CodePoint code_point, std::string& out) {
54 if (code_point <= k1ByteMaxCodepoint) {
55 out.push_back(static_cast<char>(code_point));
56 return;
57 }
58
59 if (code_point <= k2ByteMaxCodepoint) {
60 uint8_t byte_2 =
61 kContinuationHeader + (code_point & kContinuationValueMask);
62 code_point >>= kContinuationBits;
63 uint8_t byte_1 = k2ByteHeader + (code_point & k2ByteValueMask);
64 out.push_back(static_cast<char>(byte_1));
65 out.push_back(static_cast<char>(byte_2));
66 return;
67 }
68
69 if (code_point <= k3ByteMaxCodepoint) {
70 uint8_t byte_3 =
71 kContinuationHeader + (code_point & kContinuationValueMask);
72 code_point >>= kContinuationBits;
73 uint8_t byte_2 =
74 kContinuationHeader + (code_point & kContinuationValueMask);
75 code_point >>= kContinuationBits;
76 uint8_t byte_1 = k3ByteHeader + (code_point & k3ByteValueMask);
77 out.push_back(static_cast<char>(byte_1));
78 out.push_back(static_cast<char>(byte_2));
79 out.push_back(static_cast<char>(byte_3));
80 return;
81 }
82
83 if (code_point <= k4ByteMaxCodepoint) {
84 uint8_t byte_4 =
85 kContinuationHeader + (code_point & kContinuationValueMask);
86 code_point >>= kContinuationBits;
87 uint8_t byte_3 =
88 kContinuationHeader + (code_point & kContinuationValueMask);
89 code_point >>= kContinuationBits;
90 uint8_t byte_2 =
91 kContinuationHeader + (code_point & kContinuationValueMask);
92 code_point >>= kContinuationBits;
93 uint8_t byte_1 = k4ByteHeader + (code_point & k4ByteValueMask);
94 out.push_back(static_cast<char>(byte_1));
95 out.push_back(static_cast<char>(byte_2));
96 out.push_back(static_cast<char>(byte_3));
97 out.push_back(static_cast<char>(byte_4));
98 return;
99 }
100
101 PERFETTO_FATAL("Invalid code point for UTF8 conversion: %" PRIu32,
102 code_point);
103 }
104 };
105
// Byte order of the two bytes that make up one UTF-16 code unit:
// kBigEndian stores the most significant byte first, kLittleEndian stores the
// least significant byte first.
enum class Endianess { kBigEndian, kLittleEndian };
110
111 template <Endianess endianess>
112 class Utf16Iterator {
113 public:
114 using Utf16CodeUnit = uint16_t;
115
116 static constexpr Utf16CodeUnit kSurrogateMask = 0xFC00;
117 static constexpr Utf16CodeUnit kHighSurrogate = 0xD800;
118 static constexpr Utf16CodeUnit kLowSurrogate = 0xDC00;
119
120 static constexpr CodePoint kSurrogateCodepointOffset = 0x10000;
121 static constexpr uint32_t kSurrogateCodepointBits = 10;
122 static constexpr CodePoint kSurrogateCodepointMask =
123 (1u << kSurrogateCodepointBits) - 1;
124
125 CodePoint kInvalidCodePoint = 0xFFFD;
126
Utf16Iterator(protozero::ConstBytes bytes)127 explicit Utf16Iterator(protozero::ConstBytes bytes)
128 : current_(reinterpret_cast<const uint8_t*>(bytes.data)),
129 end_(reinterpret_cast<const uint8_t*>(bytes.data + bytes.size)) {}
130
HasMore() const131 bool HasMore() const { return current_ != end_; }
132
NextCodePoint()133 CodePoint NextCodePoint() {
134 std::optional<Utf16CodeUnit> maybe_surrogate = NextCodeUnit();
135 if (!maybe_surrogate) {
136 return kInvalidCodePoint;
137 }
138
139 if (PERFETTO_UNLIKELY(IsLowSurrogate(*maybe_surrogate))) {
140 return kInvalidCodePoint;
141 }
142
143 if (PERFETTO_LIKELY(!IsHighSurrogate(*maybe_surrogate))) {
144 return *maybe_surrogate;
145 }
146
147 Utf16CodeUnit high = *maybe_surrogate;
148
149 maybe_surrogate = NextCodeUnit();
150 if (!maybe_surrogate) {
151 return kInvalidCodePoint;
152 }
153
154 if (PERFETTO_UNLIKELY(!IsLowSurrogate(*maybe_surrogate))) {
155 return kInvalidCodePoint;
156 }
157
158 Utf16CodeUnit low = *maybe_surrogate;
159
160 CodePoint code_point = (high & kSurrogateCodepointMask);
161 code_point <<= kSurrogateCodepointBits;
162 code_point += (low & kSurrogateCodepointMask);
163 code_point += kSurrogateCodepointOffset;
164
165 return code_point;
166 }
167
168 private:
NextCodeUnit()169 std::optional<Utf16CodeUnit> NextCodeUnit() {
170 if (current_ == end_) {
171 return std::nullopt;
172 }
173 uint16_t byte_0 = static_cast<uint16_t>(*current_);
174 ++current_;
175
176 if (current_ == end_) {
177 return std::nullopt;
178 }
179 uint16_t byte_1 = static_cast<uint16_t>(*current_);
180 ++current_;
181
182 if (endianess == Endianess::kBigEndian) {
183 return (byte_0 << 8) + byte_1;
184 }
185
186 return byte_0 + (byte_1 << 8);
187 }
188
IsLowSurrogate(Utf16CodeUnit code_unit)189 static bool IsLowSurrogate(Utf16CodeUnit code_unit) {
190 return (code_unit & kSurrogateMask) == kLowSurrogate;
191 }
192
IsHighSurrogate(Utf16CodeUnit code_unit)193 static bool IsHighSurrogate(Utf16CodeUnit code_unit) {
194 return (code_unit & kSurrogateMask) == kHighSurrogate;
195 }
196
197 const uint8_t* current_;
198 const uint8_t* const end_;
199 };
200
201 using Utf16LeIterator = Utf16Iterator<Endianess::kLittleEndian>;
202 using Utf16BeIterator = Utf16Iterator<Endianess::kBigEndian>;
203
204 } // namespace
205
ConvertLatin1ToUtf8(protozero::ConstBytes latin1)206 std::string ConvertLatin1ToUtf8(protozero::ConstBytes latin1) {
207 size_t res_size = latin1.size;
208 for (size_t i = 0; i < latin1.size; ++i) {
209 CodePoint code_point = latin1.data[i];
210 if (code_point > Utf8::k1ByteMaxCodepoint) {
211 ++res_size;
212 }
213 }
214
215 std::string res;
216 res.reserve(res_size);
217 for (size_t i = 0; i < latin1.size; ++i) {
218 CodePoint code_point = latin1.data[i];
219 Utf8::Append(code_point, res);
220 }
221 return res;
222 }
223
ConvertUtf16LeToUtf8(protozero::ConstBytes utf16_le)224 std::string ConvertUtf16LeToUtf8(protozero::ConstBytes utf16_le) {
225 std::string res;
226 for (Utf16LeIterator iter(utf16_le); iter.HasMore();) {
227 Utf8::Append(iter.NextCodePoint(), res);
228 }
229 return res;
230 }
231
ConvertUtf16BeToUtf8(protozero::ConstBytes utf16_le)232 std::string ConvertUtf16BeToUtf8(protozero::ConstBytes utf16_le) {
233 std::string res;
234 for (Utf16BeIterator iter(utf16_le); iter.HasMore();) {
235 Utf8::Append(iter.NextCodePoint(), res);
236 }
237 return res;
238 }
239
240 } // namespace trace_processor
241 } // namespace perfetto
242