1 // Copyright 2018 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9
10 #include "base/strings/utf_string_conversions.h"
11
12 #include <limits.h>
13 #include <stdint.h>
14
15 #include <concepts>
16 #include <ostream>
17 #include <string_view>
18 #include <type_traits>
19
20 #include "base/strings/string_util.h"
21 #include "base/strings/utf_ostream_operators.h"
22 #include "base/strings/utf_string_conversion_utils.h"
23 #include "base/third_party/icu/icu_utf.h"
24 #include "build/build_config.h"
25
26 namespace base {
27
28 namespace {
29
30 constexpr base_icu::UChar32 kErrorCodePoint = 0xFFFD;
31
32 // Size coefficient ----------------------------------------------------------
33 // The maximum number of codeunits in the destination encoding corresponding to
34 // one codeunit in the source encoding.
35
// Primary template. It only applies when the source code unit is narrower than
// the destination code unit; every other direction needs an explicit
// specialization.
template <typename SrcChar, typename DestChar>
struct SizeCoefficient {
  // In every encoding an ASCII symbol occupies exactly one code unit.
  static constexpr int value{1};

  static_assert(sizeof(SrcChar) < sizeof(DestChar),
                "Default case: from a smaller encoding to the bigger one");
};
44
45 template <>
46 struct SizeCoefficient<char16_t, char> {
47 // One UTF-16 codeunit corresponds to at most 3 codeunits in UTF-8.
48 static constexpr int value = 3;
49 };
50
51 #if defined(WCHAR_T_IS_32_BIT)
52 template <>
53 struct SizeCoefficient<wchar_t, char> {
54 // UTF-8 uses at most 4 codeunits per character.
55 static constexpr int value = 4;
56 };
57
58 template <>
59 struct SizeCoefficient<wchar_t, char16_t> {
60 // UTF-16 uses at most 2 codeunits per character.
61 static constexpr int value = 2;
62 };
63 #endif // defined(WCHAR_T_IS_32_BIT)
64
65 template <typename SrcChar, typename DestChar>
66 constexpr int size_coefficient_v =
67 SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value;
68
69 // UnicodeAppendUnsafe --------------------------------------------------------
70 // Function overloads that write code_point to the output string. Output string
71 // has to have enough space for the codepoint.
72
73 // Convenience typedef that checks whether the passed in type is integral (i.e.
74 // bool, char, int or their extended versions) and is of the correct size.
75 template <typename Char, size_t N>
76 concept BitsAre = std::integral<Char> && CHAR_BIT * sizeof(Char) == N;
77
78 template <typename Char>
79 requires(BitsAre<Char, 8>)
UnicodeAppendUnsafe(Char * out,size_t * size,base_icu::UChar32 code_point)80 void UnicodeAppendUnsafe(Char* out,
81 size_t* size,
82 base_icu::UChar32 code_point) {
83 CBU8_APPEND_UNSAFE(reinterpret_cast<uint8_t*>(out), *size, code_point);
84 }
85
86 template <typename Char>
87 requires(BitsAre<Char, 16>)
UnicodeAppendUnsafe(Char * out,size_t * size,base_icu::UChar32 code_point)88 void UnicodeAppendUnsafe(Char* out,
89 size_t* size,
90 base_icu::UChar32 code_point) {
91 CBU16_APPEND_UNSAFE(out, *size, code_point);
92 }
93
94 template <typename Char>
95 requires(BitsAre<Char, 32>)
UnicodeAppendUnsafe(Char * out,size_t * size,base_icu::UChar32 code_point)96 void UnicodeAppendUnsafe(Char* out,
97 size_t* size,
98 base_icu::UChar32 code_point) {
99 out[(*size)++] = static_cast<Char>(code_point);
100 }
101
102 // DoUTFConversion ------------------------------------------------------------
103 // Main driver of UTFConversion specialized for different Src encodings.
104 // dest has to have enough room for the converted text.
105
106 template <typename DestChar>
DoUTFConversion(const char * src,size_t src_len,DestChar * dest,size_t * dest_len)107 bool DoUTFConversion(const char* src,
108 size_t src_len,
109 DestChar* dest,
110 size_t* dest_len) {
111 bool success = true;
112
113 for (size_t i = 0; i < src_len;) {
114 base_icu::UChar32 code_point;
115 CBU8_NEXT(reinterpret_cast<const uint8_t*>(src), i, src_len, code_point);
116
117 if (!IsValidCodepoint(code_point)) {
118 success = false;
119 code_point = kErrorCodePoint;
120 }
121
122 UnicodeAppendUnsafe(dest, dest_len, code_point);
123 }
124
125 return success;
126 }
127
128 template <typename DestChar>
DoUTFConversion(const char16_t * src,size_t src_len,DestChar * dest,size_t * dest_len)129 bool DoUTFConversion(const char16_t* src,
130 size_t src_len,
131 DestChar* dest,
132 size_t* dest_len) {
133 bool success = true;
134
135 auto ConvertSingleChar = [&success](char16_t in) -> base_icu::UChar32 {
136 if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) {
137 success = false;
138 return kErrorCodePoint;
139 }
140 return in;
141 };
142
143 size_t i = 0;
144
145 // Always have another symbol in order to avoid checking boundaries in the
146 // middle of the surrogate pair.
147 while (i + 1 < src_len) {
148 base_icu::UChar32 code_point;
149
150 if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) {
151 code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]);
152 if (!IsValidCodepoint(code_point)) {
153 code_point = kErrorCodePoint;
154 success = false;
155 }
156 i += 2;
157 } else {
158 code_point = ConvertSingleChar(src[i]);
159 ++i;
160 }
161
162 UnicodeAppendUnsafe(dest, dest_len, code_point);
163 }
164
165 if (i < src_len) {
166 UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i]));
167 }
168
169 return success;
170 }
171
172 #if defined(WCHAR_T_IS_32_BIT)
173
174 template <typename DestChar>
DoUTFConversion(const wchar_t * src,size_t src_len,DestChar * dest,size_t * dest_len)175 bool DoUTFConversion(const wchar_t* src,
176 size_t src_len,
177 DestChar* dest,
178 size_t* dest_len) {
179 bool success = true;
180
181 for (size_t i = 0; i < src_len; ++i) {
182 auto code_point = static_cast<base_icu::UChar32>(src[i]);
183
184 if (!IsValidCodepoint(code_point)) {
185 success = false;
186 code_point = kErrorCodePoint;
187 }
188
189 UnicodeAppendUnsafe(dest, dest_len, code_point);
190 }
191
192 return success;
193 }
194
195 #endif // defined(WCHAR_T_IS_32_BIT)
196
197 // UTFConversion --------------------------------------------------------------
198 // Function template for generating all UTF conversions.
199
200 template <typename InputString, typename DestString>
UTFConversion(const InputString & src_str,DestString * dest_str)201 bool UTFConversion(const InputString& src_str, DestString* dest_str) {
202 if (IsStringASCII(src_str)) {
203 dest_str->assign(src_str.begin(), src_str.end());
204 return true;
205 }
206
207 dest_str->resize(src_str.length() *
208 size_coefficient_v<typename InputString::value_type,
209 typename DestString::value_type>);
210
211 // Empty string is ASCII => it OK to call operator[].
212 auto* dest = &(*dest_str)[0];
213
214 // ICU requires 32 bit numbers.
215 size_t src_len = src_str.length();
216 size_t dest_len = 0;
217
218 bool res = DoUTFConversion(src_str.data(), src_len, dest, &dest_len);
219
220 dest_str->resize(dest_len);
221 dest_str->shrink_to_fit();
222
223 return res;
224 }
225
226 } // namespace
227
228 // UTF16 <-> UTF8 --------------------------------------------------------------
229
UTF8ToUTF16(const char * src,size_t src_len,std::u16string * output)230 bool UTF8ToUTF16(const char* src, size_t src_len, std::u16string* output) {
231 return UTFConversion(std::string_view(src, src_len), output);
232 }
233
UTF8ToUTF16(std::string_view utf8)234 std::u16string UTF8ToUTF16(std::string_view utf8) {
235 std::u16string ret;
236 // Ignore the success flag of this call, it will do the best it can for
237 // invalid input, which is what we want here.
238 UTF8ToUTF16(utf8.data(), utf8.size(), &ret);
239 return ret;
240 }
241
UTF16ToUTF8(const char16_t * src,size_t src_len,std::string * output)242 bool UTF16ToUTF8(const char16_t* src, size_t src_len, std::string* output) {
243 return UTFConversion(std::u16string_view(src, src_len), output);
244 }
245
UTF16ToUTF8(std::u16string_view utf16)246 std::string UTF16ToUTF8(std::u16string_view utf16) {
247 std::string ret;
248 // Ignore the success flag of this call, it will do the best it can for
249 // invalid input, which is what we want here.
250 UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
251 return ret;
252 }
253
254 // UTF-16 <-> Wide -------------------------------------------------------------
255
256 #if defined(WCHAR_T_IS_16_BIT)
257 // When wide == UTF-16 the conversions are a NOP.
258
// Copies the |src_len| code units at |src| into |*output| unchanged. Always
// returns true.
bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
  const wchar_t* const src_end = src + src_len;
  output->assign(src, src_end);
  return true;
}
263
// Returns a UTF-16 string holding a code-unit-for-code-unit copy of |wide|.
std::u16string WideToUTF16(std::wstring_view wide) {
  std::u16string utf16(wide.begin(), wide.end());
  return utf16;
}
267
// Copies the |src_len| code units at |src| into |*output| unchanged. Always
// returns true.
bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
  const char16_t* const src_end = src + src_len;
  output->assign(src, src_end);
  return true;
}
272
// Returns a wide string holding a code-unit-for-code-unit copy of |utf16|.
std::wstring UTF16ToWide(std::u16string_view utf16) {
  std::wstring wide(utf16.begin(), utf16.end());
  return wide;
}
276
277 #elif defined(WCHAR_T_IS_32_BIT)
278
WideToUTF16(const wchar_t * src,size_t src_len,std::u16string * output)279 bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
280 return UTFConversion(std::wstring_view(src, src_len), output);
281 }
282
WideToUTF16(std::wstring_view wide)283 std::u16string WideToUTF16(std::wstring_view wide) {
284 std::u16string ret;
285 // Ignore the success flag of this call, it will do the best it can for
286 // invalid input, which is what we want here.
287 WideToUTF16(wide.data(), wide.length(), &ret);
288 return ret;
289 }
290
UTF16ToWide(const char16_t * src,size_t src_len,std::wstring * output)291 bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
292 return UTFConversion(std::u16string_view(src, src_len), output);
293 }
294
UTF16ToWide(std::u16string_view utf16)295 std::wstring UTF16ToWide(std::u16string_view utf16) {
296 std::wstring ret;
297 // Ignore the success flag of this call, it will do the best it can for
298 // invalid input, which is what we want here.
299 UTF16ToWide(utf16.data(), utf16.length(), &ret);
300 return ret;
301 }
302
303 #endif // defined(WCHAR_T_IS_32_BIT)
304
305 // UTF-8 <-> Wide --------------------------------------------------------------
306
307 // UTF8ToWide is the same code, regardless of whether wide is 16 or 32 bits
308
UTF8ToWide(const char * src,size_t src_len,std::wstring * output)309 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
310 return UTFConversion(std::string_view(src, src_len), output);
311 }
312
UTF8ToWide(std::string_view utf8)313 std::wstring UTF8ToWide(std::string_view utf8) {
314 std::wstring ret;
315 // Ignore the success flag of this call, it will do the best it can for
316 // invalid input, which is what we want here.
317 UTF8ToWide(utf8.data(), utf8.length(), &ret);
318 return ret;
319 }
320
321 #if defined(WCHAR_T_IS_16_BIT)
322 // Easy case since we can use the "utf" versions we already wrote above.
323
WideToUTF8(const wchar_t * src,size_t src_len,std::string * output)324 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
325 return UTF16ToUTF8(as_u16cstr(src), src_len, output);
326 }
327
WideToUTF8(std::wstring_view wide)328 std::string WideToUTF8(std::wstring_view wide) {
329 return UTF16ToUTF8(std::u16string_view(as_u16cstr(wide), wide.size()));
330 }
331
332 #elif defined(WCHAR_T_IS_32_BIT)
333
WideToUTF8(const wchar_t * src,size_t src_len,std::string * output)334 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
335 return UTFConversion(std::wstring_view(src, src_len), output);
336 }
337
WideToUTF8(std::wstring_view wide)338 std::string WideToUTF8(std::wstring_view wide) {
339 std::string ret;
340 // Ignore the success flag of this call, it will do the best it can for
341 // invalid input, which is what we want here.
342 WideToUTF8(wide.data(), wide.length(), &ret);
343 return ret;
344 }
345
346 #endif // defined(WCHAR_T_IS_32_BIT)
347
ASCIIToUTF16(std::string_view ascii)348 std::u16string ASCIIToUTF16(std::string_view ascii) {
349 DCHECK(IsStringASCII(ascii)) << ascii;
350 return std::u16string(ascii.begin(), ascii.end());
351 }
352
UTF16ToASCII(std::u16string_view utf16)353 std::string UTF16ToASCII(std::u16string_view utf16) {
354 DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
355 return std::string(utf16.begin(), utf16.end());
356 }
357
358 #if defined(WCHAR_T_IS_16_BIT)
ASCIIToWide(std::string_view ascii)359 std::wstring ASCIIToWide(std::string_view ascii) {
360 DCHECK(IsStringASCII(ascii)) << ascii;
361 return std::wstring(ascii.begin(), ascii.end());
362 }
363
WideToASCII(std::wstring_view wide)364 std::string WideToASCII(std::wstring_view wide) {
365 DCHECK(IsStringASCII(wide)) << wide;
366 return std::string(wide.begin(), wide.end());
367 }
368 #endif // defined(WCHAR_T_IS_16_BIT)
369
370 } // namespace base
371