• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2018 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9 
10 #include "base/strings/utf_string_conversions.h"
11 
12 #include <limits.h>
13 #include <stdint.h>
14 
15 #include <concepts>
16 #include <ostream>
17 #include <string_view>
18 #include <type_traits>
19 
20 #include "base/strings/string_util.h"
21 #include "base/strings/utf_ostream_operators.h"
22 #include "base/strings/utf_string_conversion_utils.h"
23 #include "base/third_party/icu/icu_utf.h"
24 #include "build/build_config.h"
25 
26 namespace base {
27 
28 namespace {
29 
30 constexpr base_icu::UChar32 kErrorCodePoint = 0xFFFD;
31 
32 // Size coefficient ----------------------------------------------------------
33 // The maximum number of codeunits in the destination encoding corresponding to
34 // one codeunit in the source encoding.
35 
// Default worst-case expansion factor: how many destination code units one
// source code unit may turn into. The primary template only covers widening
// conversions (source unit narrower than destination unit); all other
// combinations must provide an explicit specialization.
template <typename SrcChar, typename DestChar>
struct SizeCoefficient {
  // When widening, one source code unit never produces more than one
  // destination code unit (ASCII is a single code unit in every encoding).
  static constexpr int value{1};

  static_assert(sizeof(SrcChar) < sizeof(DestChar),
                "Default case: from a smaller encoding to the bigger one");
};
44 
45 template <>
46 struct SizeCoefficient<char16_t, char> {
47   // One UTF-16 codeunit corresponds to at most 3 codeunits in UTF-8.
48   static constexpr int value = 3;
49 };
50 
51 #if defined(WCHAR_T_IS_32_BIT)
52 template <>
53 struct SizeCoefficient<wchar_t, char> {
54   // UTF-8 uses at most 4 codeunits per character.
55   static constexpr int value = 4;
56 };
57 
58 template <>
59 struct SizeCoefficient<wchar_t, char16_t> {
60   // UTF-16 uses at most 2 codeunits per character.
61   static constexpr int value = 2;
62 };
63 #endif  // defined(WCHAR_T_IS_32_BIT)
64 
65 template <typename SrcChar, typename DestChar>
66 constexpr int size_coefficient_v =
67     SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value;
68 
69 // UnicodeAppendUnsafe --------------------------------------------------------
70 // Function overloads that write code_point to the output string. Output string
71 // has to have enough space for the codepoint.
72 
73 // Convenience typedef that checks whether the passed in type is integral (i.e.
74 // bool, char, int or their extended versions) and is of the correct size.
75 template <typename Char, size_t N>
76 concept BitsAre = std::integral<Char> && CHAR_BIT * sizeof(Char) == N;
77 
78 template <typename Char>
79   requires(BitsAre<Char, 8>)
UnicodeAppendUnsafe(Char * out,size_t * size,base_icu::UChar32 code_point)80 void UnicodeAppendUnsafe(Char* out,
81                          size_t* size,
82                          base_icu::UChar32 code_point) {
83   CBU8_APPEND_UNSAFE(reinterpret_cast<uint8_t*>(out), *size, code_point);
84 }
85 
86 template <typename Char>
87   requires(BitsAre<Char, 16>)
UnicodeAppendUnsafe(Char * out,size_t * size,base_icu::UChar32 code_point)88 void UnicodeAppendUnsafe(Char* out,
89                          size_t* size,
90                          base_icu::UChar32 code_point) {
91   CBU16_APPEND_UNSAFE(out, *size, code_point);
92 }
93 
94 template <typename Char>
95   requires(BitsAre<Char, 32>)
UnicodeAppendUnsafe(Char * out,size_t * size,base_icu::UChar32 code_point)96 void UnicodeAppendUnsafe(Char* out,
97                          size_t* size,
98                          base_icu::UChar32 code_point) {
99   out[(*size)++] = static_cast<Char>(code_point);
100 }
101 
102 // DoUTFConversion ------------------------------------------------------------
103 // Main driver of UTFConversion specialized for different Src encodings.
104 // dest has to have enough room for the converted text.
105 
106 template <typename DestChar>
DoUTFConversion(const char * src,size_t src_len,DestChar * dest,size_t * dest_len)107 bool DoUTFConversion(const char* src,
108                      size_t src_len,
109                      DestChar* dest,
110                      size_t* dest_len) {
111   bool success = true;
112 
113   for (size_t i = 0; i < src_len;) {
114     base_icu::UChar32 code_point;
115     CBU8_NEXT(reinterpret_cast<const uint8_t*>(src), i, src_len, code_point);
116 
117     if (!IsValidCodepoint(code_point)) {
118       success = false;
119       code_point = kErrorCodePoint;
120     }
121 
122     UnicodeAppendUnsafe(dest, dest_len, code_point);
123   }
124 
125   return success;
126 }
127 
128 template <typename DestChar>
DoUTFConversion(const char16_t * src,size_t src_len,DestChar * dest,size_t * dest_len)129 bool DoUTFConversion(const char16_t* src,
130                      size_t src_len,
131                      DestChar* dest,
132                      size_t* dest_len) {
133   bool success = true;
134 
135   auto ConvertSingleChar = [&success](char16_t in) -> base_icu::UChar32 {
136     if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) {
137       success = false;
138       return kErrorCodePoint;
139     }
140     return in;
141   };
142 
143   size_t i = 0;
144 
145   // Always have another symbol in order to avoid checking boundaries in the
146   // middle of the surrogate pair.
147   while (i + 1 < src_len) {
148     base_icu::UChar32 code_point;
149 
150     if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) {
151       code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]);
152       if (!IsValidCodepoint(code_point)) {
153         code_point = kErrorCodePoint;
154         success = false;
155       }
156       i += 2;
157     } else {
158       code_point = ConvertSingleChar(src[i]);
159       ++i;
160     }
161 
162     UnicodeAppendUnsafe(dest, dest_len, code_point);
163   }
164 
165   if (i < src_len) {
166     UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i]));
167   }
168 
169   return success;
170 }
171 
172 #if defined(WCHAR_T_IS_32_BIT)
173 
174 template <typename DestChar>
DoUTFConversion(const wchar_t * src,size_t src_len,DestChar * dest,size_t * dest_len)175 bool DoUTFConversion(const wchar_t* src,
176                      size_t src_len,
177                      DestChar* dest,
178                      size_t* dest_len) {
179   bool success = true;
180 
181   for (size_t i = 0; i < src_len; ++i) {
182     auto code_point = static_cast<base_icu::UChar32>(src[i]);
183 
184     if (!IsValidCodepoint(code_point)) {
185       success = false;
186       code_point = kErrorCodePoint;
187     }
188 
189     UnicodeAppendUnsafe(dest, dest_len, code_point);
190   }
191 
192   return success;
193 }
194 
195 #endif  // defined(WCHAR_T_IS_32_BIT)
196 
197 // UTFConversion --------------------------------------------------------------
198 // Function template for generating all UTF conversions.
199 
200 template <typename InputString, typename DestString>
UTFConversion(const InputString & src_str,DestString * dest_str)201 bool UTFConversion(const InputString& src_str, DestString* dest_str) {
202   if (IsStringASCII(src_str)) {
203     dest_str->assign(src_str.begin(), src_str.end());
204     return true;
205   }
206 
207   dest_str->resize(src_str.length() *
208                    size_coefficient_v<typename InputString::value_type,
209                                       typename DestString::value_type>);
210 
211   // Empty string is ASCII => it OK to call operator[].
212   auto* dest = &(*dest_str)[0];
213 
214   // ICU requires 32 bit numbers.
215   size_t src_len = src_str.length();
216   size_t dest_len = 0;
217 
218   bool res = DoUTFConversion(src_str.data(), src_len, dest, &dest_len);
219 
220   dest_str->resize(dest_len);
221   dest_str->shrink_to_fit();
222 
223   return res;
224 }
225 
226 }  // namespace
227 
228 // UTF16 <-> UTF8 --------------------------------------------------------------
229 
UTF8ToUTF16(const char * src,size_t src_len,std::u16string * output)230 bool UTF8ToUTF16(const char* src, size_t src_len, std::u16string* output) {
231   return UTFConversion(std::string_view(src, src_len), output);
232 }
233 
UTF8ToUTF16(std::string_view utf8)234 std::u16string UTF8ToUTF16(std::string_view utf8) {
235   std::u16string ret;
236   // Ignore the success flag of this call, it will do the best it can for
237   // invalid input, which is what we want here.
238   UTF8ToUTF16(utf8.data(), utf8.size(), &ret);
239   return ret;
240 }
241 
UTF16ToUTF8(const char16_t * src,size_t src_len,std::string * output)242 bool UTF16ToUTF8(const char16_t* src, size_t src_len, std::string* output) {
243   return UTFConversion(std::u16string_view(src, src_len), output);
244 }
245 
UTF16ToUTF8(std::u16string_view utf16)246 std::string UTF16ToUTF8(std::u16string_view utf16) {
247   std::string ret;
248   // Ignore the success flag of this call, it will do the best it can for
249   // invalid input, which is what we want here.
250   UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
251   return ret;
252 }
253 
254 // UTF-16 <-> Wide -------------------------------------------------------------
255 
256 #if defined(WCHAR_T_IS_16_BIT)
257 // When wide == UTF-16 the conversions are a NOP.
258 
// 16-bit wchar_t: copies the `src_len` code units at `src` into `*output`
// unchanged. Always returns true.
bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
  const wchar_t* const last = src + src_len;
  output->assign(src, last);
  return true;
}
263 
// 16-bit wchar_t: returns a copy of `wide` as a UTF-16 string, code unit by
// code unit.
std::u16string WideToUTF16(std::wstring_view wide) {
  std::u16string copy(wide.begin(), wide.end());
  return copy;
}
267 
// 16-bit wchar_t: copies the `src_len` code units at `src` into `*output`
// unchanged. Always returns true.
bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
  const char16_t* const last = src + src_len;
  output->assign(src, last);
  return true;
}
272 
// 16-bit wchar_t: returns a copy of `utf16` as a wide string, code unit by
// code unit.
std::wstring UTF16ToWide(std::u16string_view utf16) {
  std::wstring copy(utf16.begin(), utf16.end());
  return copy;
}
276 
277 #elif defined(WCHAR_T_IS_32_BIT)
278 
WideToUTF16(const wchar_t * src,size_t src_len,std::u16string * output)279 bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
280   return UTFConversion(std::wstring_view(src, src_len), output);
281 }
282 
WideToUTF16(std::wstring_view wide)283 std::u16string WideToUTF16(std::wstring_view wide) {
284   std::u16string ret;
285   // Ignore the success flag of this call, it will do the best it can for
286   // invalid input, which is what we want here.
287   WideToUTF16(wide.data(), wide.length(), &ret);
288   return ret;
289 }
290 
UTF16ToWide(const char16_t * src,size_t src_len,std::wstring * output)291 bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
292   return UTFConversion(std::u16string_view(src, src_len), output);
293 }
294 
UTF16ToWide(std::u16string_view utf16)295 std::wstring UTF16ToWide(std::u16string_view utf16) {
296   std::wstring ret;
297   // Ignore the success flag of this call, it will do the best it can for
298   // invalid input, which is what we want here.
299   UTF16ToWide(utf16.data(), utf16.length(), &ret);
300   return ret;
301 }
302 
303 #endif  // defined(WCHAR_T_IS_32_BIT)
304 
305 // UTF-8 <-> Wide --------------------------------------------------------------
306 
307 // UTF8ToWide is the same code, regardless of whether wide is 16 or 32 bits
308 
UTF8ToWide(const char * src,size_t src_len,std::wstring * output)309 bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
310   return UTFConversion(std::string_view(src, src_len), output);
311 }
312 
UTF8ToWide(std::string_view utf8)313 std::wstring UTF8ToWide(std::string_view utf8) {
314   std::wstring ret;
315   // Ignore the success flag of this call, it will do the best it can for
316   // invalid input, which is what we want here.
317   UTF8ToWide(utf8.data(), utf8.length(), &ret);
318   return ret;
319 }
320 
321 #if defined(WCHAR_T_IS_16_BIT)
322 // Easy case since we can use the "utf" versions we already wrote above.
323 
WideToUTF8(const wchar_t * src,size_t src_len,std::string * output)324 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
325   return UTF16ToUTF8(as_u16cstr(src), src_len, output);
326 }
327 
WideToUTF8(std::wstring_view wide)328 std::string WideToUTF8(std::wstring_view wide) {
329   return UTF16ToUTF8(std::u16string_view(as_u16cstr(wide), wide.size()));
330 }
331 
332 #elif defined(WCHAR_T_IS_32_BIT)
333 
WideToUTF8(const wchar_t * src,size_t src_len,std::string * output)334 bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
335   return UTFConversion(std::wstring_view(src, src_len), output);
336 }
337 
WideToUTF8(std::wstring_view wide)338 std::string WideToUTF8(std::wstring_view wide) {
339   std::string ret;
340   // Ignore the success flag of this call, it will do the best it can for
341   // invalid input, which is what we want here.
342   WideToUTF8(wide.data(), wide.length(), &ret);
343   return ret;
344 }
345 
346 #endif  // defined(WCHAR_T_IS_32_BIT)
347 
ASCIIToUTF16(std::string_view ascii)348 std::u16string ASCIIToUTF16(std::string_view ascii) {
349   DCHECK(IsStringASCII(ascii)) << ascii;
350   return std::u16string(ascii.begin(), ascii.end());
351 }
352 
UTF16ToASCII(std::u16string_view utf16)353 std::string UTF16ToASCII(std::u16string_view utf16) {
354   DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
355   return std::string(utf16.begin(), utf16.end());
356 }
357 
358 #if defined(WCHAR_T_IS_16_BIT)
ASCIIToWide(std::string_view ascii)359 std::wstring ASCIIToWide(std::string_view ascii) {
360   DCHECK(IsStringASCII(ascii)) << ascii;
361   return std::wstring(ascii.begin(), ascii.end());
362 }
363 
WideToASCII(std::wstring_view wide)364 std::string WideToASCII(std::wstring_view wide) {
365   DCHECK(IsStringASCII(wide)) << wide;
366   return std::string(wide.begin(), wide.end());
367 }
368 #endif  // defined(WCHAR_T_IS_16_BIT)
369 
370 }  // namespace base
371