1 // Copyright 2018 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/strings/utf_string_conversions.h"
6
7 #include <limits.h>
8 #include <stdint.h>
9
10 #include <ostream>
11 #include <type_traits>
12
13 #include "base/strings/string_piece.h"
14 #include "base/strings/string_util.h"
15 #include "base/strings/utf_string_conversion_utils.h"
16 #include "base/third_party/icu/icu_utf.h"
17 #include "build/build_config.h"
18
19 namespace base {
20
21 namespace {
22
23 constexpr base_icu::UChar32 kErrorCodePoint = 0xFFFD;
24
25 // Size coefficient ----------------------------------------------------------
26 // The maximum number of codeunits in the destination encoding corresponding to
27 // one codeunit in the source encoding.
28
// Generic case. It is only usable when the destination code unit is strictly
// wider than the source code unit; any other direction must provide its own
// specialization or the static_assert below fires.
template <typename SrcChar, typename DestChar>
struct SizeCoefficient {
  static_assert(sizeof(DestChar) > sizeof(SrcChar),
                "Default case: from a smaller encoding to the bigger one");

  // Going from a narrower to a wider encoding, one source codeunit maps to at
  // most one destination codeunit.
  static constexpr int value{1};
};
37
38 template <>
39 struct SizeCoefficient<char16_t, char> {
40 // One UTF-16 codeunit corresponds to at most 3 codeunits in UTF-8.
41 static constexpr int value = 3;
42 };
43
44 #if defined(WCHAR_T_IS_UTF32)
45 template <>
46 struct SizeCoefficient<wchar_t, char> {
47 // UTF-8 uses at most 4 codeunits per character.
48 static constexpr int value = 4;
49 };
50
51 template <>
52 struct SizeCoefficient<wchar_t, char16_t> {
53 // UTF-16 uses at most 2 codeunits per character.
54 static constexpr int value = 2;
55 };
56 #endif // defined(WCHAR_T_IS_UTF32)
57
58 template <typename SrcChar, typename DestChar>
59 constexpr int size_coefficient_v =
60 SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value;
61
62 // UnicodeAppendUnsafe --------------------------------------------------------
63 // Function overloads that write code_point to the output string. Output string
64 // has to have enough space for the codepoint.
65
// SFINAE helper alias: names the type `bool` when |Char| is an integral type
// (bool, char, int or their extended versions) occupying exactly |N| bits.
// For any other |Char| the alias is ill-formed, which removes the overload
// using it from consideration.
template <typename Char, size_t N>
using EnableIfBitsAre =
    typename std::enable_if<(std::is_integral<Char>::value &&
                             sizeof(Char) * CHAR_BIT == N),
                            bool>::type;
72
73 template <typename Char, EnableIfBitsAre<Char, 8> = true>
UnicodeAppendUnsafe(Char * out,size_t * size,base_icu::UChar32 code_point)74 void UnicodeAppendUnsafe(Char* out,
75 size_t* size,
76 base_icu::UChar32 code_point) {
77 CBU8_APPEND_UNSAFE(reinterpret_cast<uint8_t*>(out), *size, code_point);
78 }
79
80 template <typename Char, EnableIfBitsAre<Char, 16> = true>
UnicodeAppendUnsafe(Char * out,size_t * size,base_icu::UChar32 code_point)81 void UnicodeAppendUnsafe(Char* out,
82 size_t* size,
83 base_icu::UChar32 code_point) {
84 CBU16_APPEND_UNSAFE(out, *size, code_point);
85 }
86
87 template <typename Char, EnableIfBitsAre<Char, 32> = true>
UnicodeAppendUnsafe(Char * out,size_t * size,base_icu::UChar32 code_point)88 void UnicodeAppendUnsafe(Char* out,
89 size_t* size,
90 base_icu::UChar32 code_point) {
91 out[(*size)++] = static_cast<Char>(code_point);
92 }
93
94 // DoUTFConversion ------------------------------------------------------------
95 // Main driver of UTFConversion specialized for different Src encodings.
96 // dest has to have enough room for the converted text.
97
98 template <typename DestChar>
DoUTFConversion(const char * src,size_t src_len,DestChar * dest,size_t * dest_len)99 bool DoUTFConversion(const char* src,
100 size_t src_len,
101 DestChar* dest,
102 size_t* dest_len) {
103 bool success = true;
104
105 for (size_t i = 0; i < src_len;) {
106 base_icu::UChar32 code_point;
107 CBU8_NEXT(reinterpret_cast<const uint8_t*>(src), i, src_len, code_point);
108
109 if (!IsValidCodepoint(code_point)) {
110 success = false;
111 code_point = kErrorCodePoint;
112 }
113
114 UnicodeAppendUnsafe(dest, dest_len, code_point);
115 }
116
117 return success;
118 }
119
120 template <typename DestChar>
DoUTFConversion(const char16_t * src,size_t src_len,DestChar * dest,size_t * dest_len)121 bool DoUTFConversion(const char16_t* src,
122 size_t src_len,
123 DestChar* dest,
124 size_t* dest_len) {
125 bool success = true;
126
127 auto ConvertSingleChar = [&success](char16_t in) -> base_icu::UChar32 {
128 if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) {
129 success = false;
130 return kErrorCodePoint;
131 }
132 return in;
133 };
134
135 size_t i = 0;
136
137 // Always have another symbol in order to avoid checking boundaries in the
138 // middle of the surrogate pair.
139 while (i + 1 < src_len) {
140 base_icu::UChar32 code_point;
141
142 if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) {
143 code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]);
144 if (!IsValidCodepoint(code_point)) {
145 code_point = kErrorCodePoint;
146 success = false;
147 }
148 i += 2;
149 } else {
150 code_point = ConvertSingleChar(src[i]);
151 ++i;
152 }
153
154 UnicodeAppendUnsafe(dest, dest_len, code_point);
155 }
156
157 if (i < src_len) {
158 UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i]));
159 }
160
161 return success;
162 }
163
164 #if defined(WCHAR_T_IS_UTF32)
165
166 template <typename DestChar>
DoUTFConversion(const wchar_t * src,size_t src_len,DestChar * dest,size_t * dest_len)167 bool DoUTFConversion(const wchar_t* src,
168 size_t src_len,
169 DestChar* dest,
170 size_t* dest_len) {
171 bool success = true;
172
173 for (size_t i = 0; i < src_len; ++i) {
174 auto code_point = static_cast<base_icu::UChar32>(src[i]);
175
176 if (!IsValidCodepoint(code_point)) {
177 success = false;
178 code_point = kErrorCodePoint;
179 }
180
181 UnicodeAppendUnsafe(dest, dest_len, code_point);
182 }
183
184 return success;
185 }
186
187 #endif // defined(WCHAR_T_IS_UTF32)
188
189 // UTFConversion --------------------------------------------------------------
190 // Function template for generating all UTF conversions.
191
192 template <typename InputString, typename DestString>
UTFConversion(const InputString & src_str,DestString * dest_str)193 bool UTFConversion(const InputString& src_str, DestString* dest_str) {
194 if (IsStringASCII(src_str)) {
195 dest_str->assign(src_str.begin(), src_str.end());
196 return true;
197 }
198
199 dest_str->resize(src_str.length() *
200 size_coefficient_v<typename InputString::value_type,
201 typename DestString::value_type>);
202
203 // Empty string is ASCII => it OK to call operator[].
204 auto* dest = &(*dest_str)[0];
205
206 // ICU requires 32 bit numbers.
207 size_t src_len = src_str.length();
208 size_t dest_len = 0;
209
210 bool res = DoUTFConversion(src_str.data(), src_len, dest, &dest_len);
211
212 dest_str->resize(dest_len);
213 dest_str->shrink_to_fit();
214
215 return res;
216 }
217
218 } // namespace
219
220 // UTF16 <-> UTF8 --------------------------------------------------------------
221
UTF8ToUTF16(const char * src,size_t src_len,std::u16string * output)222 bool UTF8ToUTF16(const char* src, size_t src_len, std::u16string* output) {
223 return UTFConversion(StringPiece(src, src_len), output);
224 }
225
UTF8ToUTF16(StringPiece utf8)226 std::u16string UTF8ToUTF16(StringPiece utf8) {
227 std::u16string ret;
228 // Ignore the success flag of this call, it will do the best it can for
229 // invalid input, which is what we want here.
230 UTF8ToUTF16(utf8.data(), utf8.size(), &ret);
231 return ret;
232 }
233
UTF16ToUTF8(const char16_t * src,size_t src_len,std::string * output)234 bool UTF16ToUTF8(const char16_t* src, size_t src_len, std::string* output) {
235 return UTFConversion(StringPiece16(src, src_len), output);
236 }
237
UTF16ToUTF8(StringPiece16 utf16)238 std::string UTF16ToUTF8(StringPiece16 utf16) {
239 std::string ret;
240 // Ignore the success flag of this call, it will do the best it can for
241 // invalid input, which is what we want here.
242 UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
243 return ret;
244 }
245
246 // UTF-16 <-> Wide -------------------------------------------------------------
247
248 #if defined(WCHAR_T_IS_UTF16)
249 // When wide == UTF-16 the conversions are a NOP.
250
// wchar_t already holds UTF-16 here, so the units are copied across without
// any decoding. Always reports success.
bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
  const wchar_t* const src_end = src + src_len;
  output->assign(src, src_end);
  return true;
}
255
WideToUTF16(WStringPiece wide)256 std::u16string WideToUTF16(WStringPiece wide) {
257 return std::u16string(wide.begin(), wide.end());
258 }
259
// wchar_t holds UTF-16 here, so the units are copied across without any
// decoding. Always reports success.
bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
  const char16_t* const src_end = src + src_len;
  output->assign(src, src_end);
  return true;
}
264
UTF16ToWide(StringPiece16 utf16)265 std::wstring UTF16ToWide(StringPiece16 utf16) {
266 return std::wstring(utf16.begin(), utf16.end());
267 }
268
269 #elif defined(WCHAR_T_IS_UTF32)
270
WideToUTF16(const wchar_t * src,size_t src_len,std::u16string * output)271 bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
272 return UTFConversion(base::WStringPiece(src, src_len), output);
273 }
274
WideToUTF16(WStringPiece wide)275 std::u16string WideToUTF16(WStringPiece wide) {
276 std::u16string ret;
277 // Ignore the success flag of this call, it will do the best it can for
278 // invalid input, which is what we want here.
279 WideToUTF16(wide.data(), wide.length(), &ret);
280 return ret;
281 }
282
UTF16ToWide(const char16_t * src,size_t src_len,std::wstring * output)283 bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
284 return UTFConversion(StringPiece16(src, src_len), output);
285 }
286
UTF16ToWide(StringPiece16 utf16)287 std::wstring UTF16ToWide(StringPiece16 utf16) {
288 std::wstring ret;
289 // Ignore the success flag of this call, it will do the best it can for
290 // invalid input, which is what we want here.
291 UTF16ToWide(utf16.data(), utf16.length(), &ret);
292 return ret;
293 }
294
295 #endif // defined(WCHAR_T_IS_UTF32)
296
297 // UTF-8 <-> Wide --------------------------------------------------------------
298
299 // UTF8ToWide is the same code, regardless of whether wide is 16 or 32 bits
300
UTF8ToWide(const char * src,size_t src_len,std::wstring * output)301 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
302 return UTFConversion(StringPiece(src, src_len), output);
303 }
304
UTF8ToWide(StringPiece utf8)305 std::wstring UTF8ToWide(StringPiece utf8) {
306 std::wstring ret;
307 // Ignore the success flag of this call, it will do the best it can for
308 // invalid input, which is what we want here.
309 UTF8ToWide(utf8.data(), utf8.length(), &ret);
310 return ret;
311 }
312
313 #if defined(WCHAR_T_IS_UTF16)
314 // Easy case since we can use the "utf" versions we already wrote above.
315
WideToUTF8(const wchar_t * src,size_t src_len,std::string * output)316 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
317 return UTF16ToUTF8(as_u16cstr(src), src_len, output);
318 }
319
WideToUTF8(WStringPiece wide)320 std::string WideToUTF8(WStringPiece wide) {
321 return UTF16ToUTF8(StringPiece16(as_u16cstr(wide), wide.size()));
322 }
323
324 #elif defined(WCHAR_T_IS_UTF32)
325
WideToUTF8(const wchar_t * src,size_t src_len,std::string * output)326 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
327 return UTFConversion(WStringPiece(src, src_len), output);
328 }
329
WideToUTF8(WStringPiece wide)330 std::string WideToUTF8(WStringPiece wide) {
331 std::string ret;
332 // Ignore the success flag of this call, it will do the best it can for
333 // invalid input, which is what we want here.
334 WideToUTF8(wide.data(), wide.length(), &ret);
335 return ret;
336 }
337
338 #endif // defined(WCHAR_T_IS_UTF32)
339
ASCIIToUTF16(StringPiece ascii)340 std::u16string ASCIIToUTF16(StringPiece ascii) {
341 DCHECK(IsStringASCII(ascii)) << ascii;
342 return std::u16string(ascii.begin(), ascii.end());
343 }
344
UTF16ToASCII(StringPiece16 utf16)345 std::string UTF16ToASCII(StringPiece16 utf16) {
346 DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
347 return std::string(utf16.begin(), utf16.end());
348 }
349
350 #if defined(WCHAR_T_IS_UTF16)
ASCIIToWide(StringPiece ascii)351 std::wstring ASCIIToWide(StringPiece ascii) {
352 DCHECK(IsStringASCII(ascii)) << ascii;
353 return std::wstring(ascii.begin(), ascii.end());
354 }
355
WideToASCII(WStringPiece wide)356 std::string WideToASCII(WStringPiece wide) {
357 DCHECK(IsStringASCII(wide)) << wide;
358 return std::string(wide.begin(), wide.end());
359 }
360 #endif // defined(WCHAR_T_IS_UTF16)
361
362 } // namespace base
363