• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 
2 /*
3  * Copyright (C) 2024 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *      http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
18 #include "src/trace_processor/importers/proto/string_encoding_utils.h"
19 
20 #include <cinttypes>
21 #include <cstddef>
22 #include <cstdint>
23 #include <optional>
24 #include <string>
25 
26 #include "perfetto/base/logging.h"
27 
28 namespace perfetto {
29 namespace trace_processor {
30 namespace {
31 
// A Unicode code point held in 32 bits. This is the unit exchanged between the
// decoders in this file and Utf8::Append.
32 using CodePoint = uint32_t;
33 
34 struct Utf8 {
35   static constexpr uint8_t kContinuationHeader = 0x80;
36   static constexpr uint8_t kContinuationValueMask = 0x3F;
37   static constexpr uint8_t kContinuationBits = 6;
38 
39   static constexpr uint8_t k2ByteHeader = 0xC0;
40   static constexpr uint8_t k2ByteValueMask = 0x1F;
41 
42   static constexpr uint8_t k3ByteHeader = 0xE0;
43   static constexpr uint8_t k3ByteValueMask = 0x0F;
44 
45   static constexpr uint8_t k4ByteHeader = 0xF0;
46   static constexpr uint8_t k4ByteValueMask = 0x07;
47 
48   static constexpr CodePoint k1ByteMaxCodepoint = 0x7F;
49   static constexpr CodePoint k2ByteMaxCodepoint = 0x7FF;
50   static constexpr CodePoint k3ByteMaxCodepoint = 0xFFFF;
51   static constexpr CodePoint k4ByteMaxCodepoint = 0x10FFFF;
52 
Appendperfetto::trace_processor::__anon34fd468b0111::Utf853   static void Append(CodePoint code_point, std::string& out) {
54     if (code_point <= k1ByteMaxCodepoint) {
55       out.push_back(static_cast<char>(code_point));
56       return;
57     }
58 
59     if (code_point <= k2ByteMaxCodepoint) {
60       uint8_t byte_2 =
61           kContinuationHeader + (code_point & kContinuationValueMask);
62       code_point >>= kContinuationBits;
63       uint8_t byte_1 = k2ByteHeader + (code_point & k2ByteValueMask);
64       out.push_back(static_cast<char>(byte_1));
65       out.push_back(static_cast<char>(byte_2));
66       return;
67     }
68 
69     if (code_point <= k3ByteMaxCodepoint) {
70       uint8_t byte_3 =
71           kContinuationHeader + (code_point & kContinuationValueMask);
72       code_point >>= kContinuationBits;
73       uint8_t byte_2 =
74           kContinuationHeader + (code_point & kContinuationValueMask);
75       code_point >>= kContinuationBits;
76       uint8_t byte_1 = k3ByteHeader + (code_point & k3ByteValueMask);
77       out.push_back(static_cast<char>(byte_1));
78       out.push_back(static_cast<char>(byte_2));
79       out.push_back(static_cast<char>(byte_3));
80       return;
81     }
82 
83     if (code_point <= k4ByteMaxCodepoint) {
84       uint8_t byte_4 =
85           kContinuationHeader + (code_point & kContinuationValueMask);
86       code_point >>= kContinuationBits;
87       uint8_t byte_3 =
88           kContinuationHeader + (code_point & kContinuationValueMask);
89       code_point >>= kContinuationBits;
90       uint8_t byte_2 =
91           kContinuationHeader + (code_point & kContinuationValueMask);
92       code_point >>= kContinuationBits;
93       uint8_t byte_1 = k4ByteHeader + (code_point & k4ByteValueMask);
94       out.push_back(static_cast<char>(byte_1));
95       out.push_back(static_cast<char>(byte_2));
96       out.push_back(static_cast<char>(byte_3));
97       out.push_back(static_cast<char>(byte_4));
98       return;
99     }
100 
101     PERFETTO_FATAL("Invalid code point for UTF8 conversion: %" PRIu32,
102                    code_point);
103   }
104 };
105 
// Byte order of the 16-bit code units inside a UTF-16 byte buffer. Used as the
// template argument of Utf16Iterator.
106 enum class Endianess {
// The first byte of each code unit is the most significant one.
107   kBigEndian,
// The first byte of each code unit is the least significant one.
108   kLittleEndian,
109 };
110 
111 template <Endianess endianess>
112 class Utf16Iterator {
113  public:
114   using Utf16CodeUnit = uint16_t;
115 
116   static constexpr Utf16CodeUnit kSurrogateMask = 0xFC00;
117   static constexpr Utf16CodeUnit kHighSurrogate = 0xD800;
118   static constexpr Utf16CodeUnit kLowSurrogate = 0xDC00;
119 
120   static constexpr CodePoint kSurrogateCodepointOffset = 0x10000;
121   static constexpr uint32_t kSurrogateCodepointBits = 10;
122   static constexpr CodePoint kSurrogateCodepointMask =
123       (1u << kSurrogateCodepointBits) - 1;
124 
125   CodePoint kInvalidCodePoint = 0xFFFD;
126 
Utf16Iterator(protozero::ConstBytes bytes)127   explicit Utf16Iterator(protozero::ConstBytes bytes)
128       : current_(reinterpret_cast<const uint8_t*>(bytes.data)),
129         end_(reinterpret_cast<const uint8_t*>(bytes.data + bytes.size)) {}
130 
HasMore() const131   bool HasMore() const { return current_ != end_; }
132 
NextCodePoint()133   CodePoint NextCodePoint() {
134     std::optional<Utf16CodeUnit> maybe_surrogate = NextCodeUnit();
135     if (!maybe_surrogate) {
136       return kInvalidCodePoint;
137     }
138 
139     if (PERFETTO_UNLIKELY(IsLowSurrogate(*maybe_surrogate))) {
140       return kInvalidCodePoint;
141     }
142 
143     if (PERFETTO_LIKELY(!IsHighSurrogate(*maybe_surrogate))) {
144       return *maybe_surrogate;
145     }
146 
147     Utf16CodeUnit high = *maybe_surrogate;
148 
149     maybe_surrogate = NextCodeUnit();
150     if (!maybe_surrogate) {
151       return kInvalidCodePoint;
152     }
153 
154     if (PERFETTO_UNLIKELY(!IsLowSurrogate(*maybe_surrogate))) {
155       return kInvalidCodePoint;
156     }
157 
158     Utf16CodeUnit low = *maybe_surrogate;
159 
160     CodePoint code_point = (high & kSurrogateCodepointMask);
161     code_point <<= kSurrogateCodepointBits;
162     code_point += (low & kSurrogateCodepointMask);
163     code_point += kSurrogateCodepointOffset;
164 
165     return code_point;
166   }
167 
168  private:
NextCodeUnit()169   std::optional<Utf16CodeUnit> NextCodeUnit() {
170     if (current_ == end_) {
171       return std::nullopt;
172     }
173     uint16_t byte_0 = static_cast<uint16_t>(*current_);
174     ++current_;
175 
176     if (current_ == end_) {
177       return std::nullopt;
178     }
179     uint16_t byte_1 = static_cast<uint16_t>(*current_);
180     ++current_;
181 
182     if (endianess == Endianess::kBigEndian) {
183       return (byte_0 << 8) + byte_1;
184     }
185 
186     return byte_0 + (byte_1 << 8);
187   }
188 
IsLowSurrogate(Utf16CodeUnit code_unit)189   static bool IsLowSurrogate(Utf16CodeUnit code_unit) {
190     return (code_unit & kSurrogateMask) == kLowSurrogate;
191   }
192 
IsHighSurrogate(Utf16CodeUnit code_unit)193   static bool IsHighSurrogate(Utf16CodeUnit code_unit) {
194     return (code_unit & kSurrogateMask) == kHighSurrogate;
195   }
196 
197   const uint8_t* current_;
198   const uint8_t* const end_;
199 };
200 
// Convenience names for the two byte-order instantiations of Utf16Iterator.
201 using Utf16LeIterator = Utf16Iterator<Endianess::kLittleEndian>;
202 using Utf16BeIterator = Utf16Iterator<Endianess::kBigEndian>;
203 
204 }  // namespace
205 
ConvertLatin1ToUtf8(protozero::ConstBytes latin1)206 std::string ConvertLatin1ToUtf8(protozero::ConstBytes latin1) {
207   size_t res_size = latin1.size;
208   for (size_t i = 0; i < latin1.size; ++i) {
209     CodePoint code_point = latin1.data[i];
210     if (code_point > Utf8::k1ByteMaxCodepoint) {
211       ++res_size;
212     }
213   }
214 
215   std::string res;
216   res.reserve(res_size);
217   for (size_t i = 0; i < latin1.size; ++i) {
218     CodePoint code_point = latin1.data[i];
219     Utf8::Append(code_point, res);
220   }
221   return res;
222 }
223 
ConvertUtf16LeToUtf8(protozero::ConstBytes utf16_le)224 std::string ConvertUtf16LeToUtf8(protozero::ConstBytes utf16_le) {
225   std::string res;
226   for (Utf16LeIterator iter(utf16_le); iter.HasMore();) {
227     Utf8::Append(iter.NextCodePoint(), res);
228   }
229   return res;
230 }
231 
ConvertUtf16BeToUtf8(protozero::ConstBytes utf16_le)232 std::string ConvertUtf16BeToUtf8(protozero::ConstBytes utf16_le) {
233   std::string res;
234   for (Utf16BeIterator iter(utf16_le); iter.HasMore();) {
235     Utf8::Append(iter.NextCodePoint(), res);
236   }
237   return res;
238 }
239 
240 }  // namespace trace_processor
241 }  // namespace perfetto
242