// Copyright 2018 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "base/strings/utf_string_conversions.h"

#include <limits.h>
#include <stdint.h>

#include <ostream>
#include <type_traits>

#include "base/strings/string_piece.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversion_utils.h"
#include "base/third_party/icu/icu_utf.h"
#include "build/build_config.h"

namespace base {

namespace {

constexpr base_icu::UChar32 kErrorCodePoint = 0xFFFD;

// Size coefficient ----------------------------------------------------------
// The maximum number of codeunits in the destination encoding corresponding to
// one codeunit in the source encoding.

template <typename SrcChar, typename DestChar>
struct SizeCoefficient {
  static_assert(sizeof(SrcChar) < sizeof(DestChar),
                "Default case: from a smaller encoding to the bigger one");

  // ASCII symbols are encoded by one codeunit in all encodings.
  static constexpr int value = 1;
};

template <>
struct SizeCoefficient<char16_t, char> {
  // One UTF-16 codeunit corresponds to at most 3 codeunits in UTF-8.
  static constexpr int value = 3;
};

#if defined(WCHAR_T_IS_UTF32)
template <>
struct SizeCoefficient<wchar_t, char> {
  // UTF-8 uses at most 4 codeunits per character.
  static constexpr int value = 4;
};

template <>
struct SizeCoefficient<wchar_t, char16_t> {
  // UTF-16 uses at most 2 codeunits per character.
  static constexpr int value = 2;
};
#endif  // defined(WCHAR_T_IS_UTF32)
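
// Illustrative worst cases for the coefficients above (examples, not
// exhaustive): U+FFFF is a single UTF-16 code unit but three UTF-8 code units
// (0xEF 0xBF 0xBF); U+1F600 is a single UTF-32 code unit but four UTF-8 code
// units (0xF0 0x9F 0x98 0x80) or two UTF-16 code units (0xD83D 0xDE00).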

template <typename SrcChar, typename DestChar>
constexpr int size_coefficient_v =
    SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value;

// UnicodeAppendUnsafe --------------------------------------------------------
// Function overloads that write code_point to the output string. Output string
// has to have enough space for the codepoint.

// Convenience typedef that checks whether the passed in type is integral (i.e.
// bool, char, int or their extended versions) and is of the correct size.
template <typename Char, size_t N>
using EnableIfBitsAre = std::enable_if_t<std::is_integral<Char>::value &&
                                             CHAR_BIT * sizeof(Char) == N,
                                         bool>;
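
// For example, EnableIfBitsAre<char16_t, 16> is well-formed on platforms where
// char16_t is 16 bits wide and selects the 16-bit overload below; non-integral
// or wrongly sized types drop the corresponding overload via SFINAE.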

template <typename Char, EnableIfBitsAre<Char, 8> = true>
void UnicodeAppendUnsafe(Char* out,
                         size_t* size,
                         base_icu::UChar32 code_point) {
  CBU8_APPEND_UNSAFE(reinterpret_cast<uint8_t*>(out), *size, code_point);
}

template <typename Char, EnableIfBitsAre<Char, 16> = true>
void UnicodeAppendUnsafe(Char* out,
                         size_t* size,
                         base_icu::UChar32 code_point) {
  CBU16_APPEND_UNSAFE(out, *size, code_point);
}

template <typename Char, EnableIfBitsAre<Char, 32> = true>
void UnicodeAppendUnsafe(Char* out,
                         size_t* size,
                         base_icu::UChar32 code_point) {
  out[(*size)++] = static_cast<Char>(code_point);
}
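
// To illustrate the three overloads above: appending U+1F600 writes four code
// units through CBU8_APPEND_UNSAFE, two code units (a surrogate pair) through
// CBU16_APPEND_UNSAFE, and a single code unit in the 32-bit case.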

// DoUTFConversion ------------------------------------------------------------
// Main driver of UTFConversion specialized for different Src encodings.
// dest has to have enough room for the converted text.

template <typename DestChar>
bool DoUTFConversion(const char* src,
                     size_t src_len,
                     DestChar* dest,
                     size_t* dest_len) {
  bool success = true;

  for (size_t i = 0; i < src_len;) {
    base_icu::UChar32 code_point;
    CBU8_NEXT(reinterpret_cast<const uint8_t*>(src), i, src_len, code_point);

    if (!IsValidCodepoint(code_point)) {
      success = false;
      code_point = kErrorCodePoint;
    }

    UnicodeAppendUnsafe(dest, dest_len, code_point);
  }

  return success;
}
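
// Note: for an ill-formed byte sequence CBU8_NEXT produces a value that fails
// IsValidCodepoint(), so the loop above substitutes kErrorCodePoint (U+FFFD)
// and reports failure while still converting the rest of the input.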

template <typename DestChar>
bool DoUTFConversion(const char16_t* src,
                     size_t src_len,
                     DestChar* dest,
                     size_t* dest_len) {
  bool success = true;

  auto ConvertSingleChar = [&success](char16_t in) -> base_icu::UChar32 {
    if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) {
      success = false;
      return kErrorCodePoint;
    }
    return in;
  };

  size_t i = 0;

  // Always have another symbol in order to avoid checking boundaries in the
  // middle of the surrogate pair.
  while (i + 1 < src_len) {
    base_icu::UChar32 code_point;

    if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) {
      code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]);
      if (!IsValidCodepoint(code_point)) {
        code_point = kErrorCodePoint;
        success = false;
      }
      i += 2;
    } else {
      code_point = ConvertSingleChar(src[i]);
      ++i;
    }

    UnicodeAppendUnsafe(dest, dest_len, code_point);
  }

  if (i < src_len) {
    UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i]));
  }

  return success;
}
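
// Rough example of the loop above: the pair 0xD83D 0xDE00 decodes to U+1F600
// via the surrogate-pair branch, while a lone lead or trail surrogate falls
// through to ConvertSingleChar() and comes out as kErrorCodePoint.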

#if defined(WCHAR_T_IS_UTF32)

template <typename DestChar>
bool DoUTFConversion(const wchar_t* src,
                     size_t src_len,
                     DestChar* dest,
                     size_t* dest_len) {
  bool success = true;

  for (size_t i = 0; i < src_len; ++i) {
    auto code_point = static_cast<base_icu::UChar32>(src[i]);

    if (!IsValidCodepoint(code_point)) {
      success = false;
      code_point = kErrorCodePoint;
    }

    UnicodeAppendUnsafe(dest, dest_len, code_point);
  }

  return success;
}

#endif  // defined(WCHAR_T_IS_UTF32)

// UTFConversion --------------------------------------------------------------
// Function template for generating all UTF conversions.

template <typename InputString, typename DestString>
bool UTFConversion(const InputString& src_str, DestString* dest_str) {
  if (IsStringASCII(src_str)) {
    dest_str->assign(src_str.begin(), src_str.end());
    return true;
  }

  dest_str->resize(src_str.length() *
                   size_coefficient_v<typename InputString::value_type,
                                      typename DestString::value_type>);

  // Empty string is ASCII => it's OK to call operator[].
  auto* dest = &(*dest_str)[0];

  // ICU requires 32 bit numbers.
  size_t src_len = src_str.length();
  size_t dest_len = 0;

  bool res = DoUTFConversion(src_str.data(), src_len, dest, &dest_len);

  dest_str->resize(dest_len);
  dest_str->shrink_to_fit();

  return res;
}
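
// Sizing example for the template above: converting a 10-code-unit UTF-16
// string to UTF-8 first resizes the destination to 30 code units
// (size_coefficient_v<char16_t, char> == 3), then trims it to the number of
// code units actually written.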

}  // namespace

// UTF-16 <-> UTF-8 ------------------------------------------------------------

bool UTF8ToUTF16(const char* src, size_t src_len, std::u16string* output) {
  return UTFConversion(StringPiece(src, src_len), output);
}

std::u16string UTF8ToUTF16(StringPiece utf8) {
  std::u16string ret;
  // Ignore the success flag of this call; it will do the best it can for
  // invalid input, which is what we want here.
  UTF8ToUTF16(utf8.data(), utf8.size(), &ret);
  return ret;
}
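
// Usage sketch (hypothetical caller): UTF8ToUTF16("caf\xc3\xa9") yields
// u"café", and ill-formed input bytes come back as U+FFFD replacement
// characters instead of failing the conversion.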

bool UTF16ToUTF8(const char16_t* src, size_t src_len, std::string* output) {
  return UTFConversion(StringPiece16(src, src_len), output);
}

std::string UTF16ToUTF8(StringPiece16 utf16) {
  std::string ret;
  // Ignore the success flag of this call; it will do the best it can for
  // invalid input, which is what we want here.
  UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
  return ret;
}

// UTF-16 <-> Wide -------------------------------------------------------------

#if defined(WCHAR_T_IS_UTF16)
// When wide == UTF-16 the conversions are a NOP.

bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
  output->assign(src, src + src_len);
  return true;
}

std::u16string WideToUTF16(WStringPiece wide) {
  return std::u16string(wide.begin(), wide.end());
}

bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
  output->assign(src, src + src_len);
  return true;
}

std::wstring UTF16ToWide(StringPiece16 utf16) {
  return std::wstring(utf16.begin(), utf16.end());
}

#elif defined(WCHAR_T_IS_UTF32)

bool WideToUTF16(const wchar_t* src, size_t src_len, std::u16string* output) {
  return UTFConversion(base::WStringPiece(src, src_len), output);
}

std::u16string WideToUTF16(WStringPiece wide) {
  std::u16string ret;
  // Ignore the success flag of this call; it will do the best it can for
  // invalid input, which is what we want here.
  WideToUTF16(wide.data(), wide.length(), &ret);
  return ret;
}

bool UTF16ToWide(const char16_t* src, size_t src_len, std::wstring* output) {
  return UTFConversion(StringPiece16(src, src_len), output);
}

std::wstring UTF16ToWide(StringPiece16 utf16) {
  std::wstring ret;
  // Ignore the success flag of this call; it will do the best it can for
  // invalid input, which is what we want here.
  UTF16ToWide(utf16.data(), utf16.length(), &ret);
  return ret;
}

#endif  // defined(WCHAR_T_IS_UTF32)

// UTF-8 <-> Wide --------------------------------------------------------------

// UTF8ToWide is the same code, regardless of whether wide is 16 or 32 bits.

bool UTF8ToWide(const char* src, size_t src_len, std::wstring* output) {
  return UTFConversion(StringPiece(src, src_len), output);
}

std::wstring UTF8ToWide(StringPiece utf8) {
  std::wstring ret;
  // Ignore the success flag of this call; it will do the best it can for
  // invalid input, which is what we want here.
  UTF8ToWide(utf8.data(), utf8.length(), &ret);
  return ret;
}

#if defined(WCHAR_T_IS_UTF16)
// Easy case since we can use the "utf" versions we already wrote above.

bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
  return UTF16ToUTF8(as_u16cstr(src), src_len, output);
}

std::string WideToUTF8(WStringPiece wide) {
  return UTF16ToUTF8(StringPiece16(as_u16cstr(wide), wide.size()));
}

#elif defined(WCHAR_T_IS_UTF32)

bool WideToUTF8(const wchar_t* src, size_t src_len, std::string* output) {
  return UTFConversion(WStringPiece(src, src_len), output);
}

std::string WideToUTF8(WStringPiece wide) {
  std::string ret;
  // Ignore the success flag of this call; it will do the best it can for
  // invalid input, which is what we want here.
  WideToUTF8(wide.data(), wide.length(), &ret);
  return ret;
}

#endif  // defined(WCHAR_T_IS_UTF32)

std::u16string ASCIIToUTF16(StringPiece ascii) {
  DCHECK(IsStringASCII(ascii)) << ascii;
  return std::u16string(ascii.begin(), ascii.end());
}

std::string UTF16ToASCII(StringPiece16 utf16) {
  DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
  return std::string(utf16.begin(), utf16.end());
}

#if defined(WCHAR_T_IS_UTF16)
std::wstring ASCIIToWide(StringPiece ascii) {
  DCHECK(IsStringASCII(ascii)) << ascii;
  return std::wstring(ascii.begin(), ascii.end());
}

std::string WideToASCII(WStringPiece wide) {
  DCHECK(IsStringASCII(wide)) << wide;
  return std::string(wide.begin(), wide.end());
}
#endif  // defined(WCHAR_T_IS_UTF16)

}  // namespace base