// Copyright (c) 2018 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
5 #include "base/strings/utf_string_conversions.h"
6 
7 #include <stdint.h>
8 
9 #include <string_view>
10 
11 #include "base/strings/string_util.h"
12 #include "base/strings/utf_string_conversion_utils.h"
13 #include "base/third_party/icu/icu_utf.h"
14 #include "util/build_config.h"
15 
16 namespace base {
17 
18 namespace {
19 
20 constexpr int32_t kErrorCodePoint = 0xFFFD;
21 
22 // Size coefficient ----------------------------------------------------------
23 // The maximum number of codeunits in the destination encoding corresponding to
24 // one codeunit in the source encoding.
25 
// Generic case, used when the source code unit is narrower than the
// destination code unit. Conversions in the other direction must provide an
// explicit specialization; instantiating this template for them trips the
// static_assert below.
template <typename SrcChar, typename DestChar>
struct SizeCoefficient {
  // ASCII symbols are encoded by one codeunit in all encodings.
  static constexpr int value = 1;

  static_assert(sizeof(DestChar) > sizeof(SrcChar),
                "Default case: from a smaller encoding to the bigger one");
};
34 
35 template <>
36 struct SizeCoefficient<char16_t, char> {
37   // One UTF-16 codeunit corresponds to at most 3 codeunits in UTF-8.
38   static constexpr int value = 3;
39 };
40 
41 template <typename SrcChar, typename DestChar>
42 constexpr int size_coefficient_v =
43     SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value;
44 
45 // UnicodeAppendUnsafe --------------------------------------------------------
46 // Function overloads that write code_point to the output string. Output string
47 // has to have enough space for the codepoint.
48 
UnicodeAppendUnsafe(char * out,int32_t * size,uint32_t code_point)49 void UnicodeAppendUnsafe(char* out, int32_t* size, uint32_t code_point) {
50   CBU8_APPEND_UNSAFE(out, *size, code_point);
51 }
52 
UnicodeAppendUnsafe(char16_t * out,int32_t * size,uint32_t code_point)53 void UnicodeAppendUnsafe(char16_t* out, int32_t* size, uint32_t code_point) {
54   CBU16_APPEND_UNSAFE(out, *size, code_point);
55 }
56 
57 // DoUTFConversion ------------------------------------------------------------
58 // Main driver of UTFConversion specialized for different Src encodings.
59 // dest has to have enough room for the converted text.
60 
61 template <typename DestChar>
DoUTFConversion(const char * src,int32_t src_len,DestChar * dest,int32_t * dest_len)62 bool DoUTFConversion(const char* src,
63                      int32_t src_len,
64                      DestChar* dest,
65                      int32_t* dest_len) {
66   bool success = true;
67 
68   for (int32_t i = 0; i < src_len;) {
69     int32_t code_point;
70     CBU8_NEXT(src, i, src_len, code_point);
71 
72     if (!IsValidCodepoint(code_point)) {
73       success = false;
74       code_point = kErrorCodePoint;
75     }
76 
77     UnicodeAppendUnsafe(dest, dest_len, code_point);
78   }
79 
80   return success;
81 }
82 
83 template <typename DestChar>
DoUTFConversion(const char16_t * src,int32_t src_len,DestChar * dest,int32_t * dest_len)84 bool DoUTFConversion(const char16_t* src,
85                      int32_t src_len,
86                      DestChar* dest,
87                      int32_t* dest_len) {
88   bool success = true;
89 
90   auto ConvertSingleChar = [&success](char16_t in) -> int32_t {
91     if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) {
92       success = false;
93       return kErrorCodePoint;
94     }
95     return in;
96   };
97 
98   int32_t i = 0;
99 
100   // Always have another symbol in order to avoid checking boundaries in the
101   // middle of the surrogate pair.
102   while (i < src_len - 1) {
103     int32_t code_point;
104 
105     if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) {
106       code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]);
107       if (!IsValidCodepoint(code_point)) {
108         code_point = kErrorCodePoint;
109         success = false;
110       }
111       i += 2;
112     } else {
113       code_point = ConvertSingleChar(src[i]);
114       ++i;
115     }
116 
117     UnicodeAppendUnsafe(dest, dest_len, code_point);
118   }
119 
120   if (i < src_len)
121     UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i]));
122 
123   return success;
124 }
125 
126 // UTFConversion --------------------------------------------------------------
127 // Function template for generating all UTF conversions.
128 
129 template <typename InputString, typename DestString>
UTFConversion(const InputString & src_str,DestString * dest_str)130 bool UTFConversion(const InputString& src_str, DestString* dest_str) {
131   if (IsStringASCII(src_str)) {
132     dest_str->assign(src_str.begin(), src_str.end());
133     return true;
134   }
135 
136   dest_str->resize(src_str.length() *
137                    size_coefficient_v<typename InputString::value_type,
138                                       typename DestString::value_type>);
139 
140   // Empty string is ASCII => it OK to call operator[].
141   auto* dest = &(*dest_str)[0];
142 
143   // ICU requires 32 bit numbers.
144   int32_t src_len32 = static_cast<int32_t>(src_str.length());
145   int32_t dest_len32 = 0;
146 
147   bool res = DoUTFConversion(src_str.data(), src_len32, dest, &dest_len32);
148 
149   dest_str->resize(dest_len32);
150   dest_str->shrink_to_fit();
151 
152   return res;
153 }
154 
155 }  // namespace
156 
157 // UTF16 <-> UTF8 --------------------------------------------------------------
158 
UTF8ToUTF16(const char * src,size_t src_len,std::u16string * output)159 bool UTF8ToUTF16(const char* src, size_t src_len, std::u16string* output) {
160   return UTFConversion(std::string_view(src, src_len), output);
161 }
162 
UTF8ToUTF16(std::string_view utf8)163 std::u16string UTF8ToUTF16(std::string_view utf8) {
164   std::u16string ret;
165   // Ignore the success flag of this call, it will do the best it can for
166   // invalid input, which is what we want here.
167   UTF8ToUTF16(utf8.data(), utf8.size(), &ret);
168   return ret;
169 }
170 
UTF16ToUTF8(const char16_t * src,size_t src_len,std::string * output)171 bool UTF16ToUTF8(const char16_t* src, size_t src_len, std::string* output) {
172   return UTFConversion(std::u16string_view(src, src_len), output);
173 }
174 
UTF16ToUTF8(std::u16string_view utf16)175 std::string UTF16ToUTF8(std::u16string_view utf16) {
176   std::string ret;
177   // Ignore the success flag of this call, it will do the best it can for
178   // invalid input, which is what we want here.
179   UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
180   return ret;
181 }
182 
183 // ASCII <-> UTF-16 -----------------------------------------------------------
184 
ASCIIToUTF16(std::string_view ascii)185 std::u16string ASCIIToUTF16(std::string_view ascii) {
186   DCHECK(IsStringASCII(ascii)) << ascii;
187   return std::u16string(ascii.begin(), ascii.end());
188 }
189 
UTF16ToASCII(std::u16string_view utf16)190 std::string UTF16ToASCII(std::u16string_view utf16) {
191   DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
192   return std::string(utf16.begin(), utf16.end());
193 }
194 
195 }  // namespace base
196