1 // Copyright (c) 2018 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/strings/utf_string_conversions.h"
6
7 #include <stdint.h>
8
9 #include <string_view>
10
11 #include "base/strings/string_util.h"
12 #include "base/strings/utf_string_conversion_utils.h"
13 #include "base/third_party/icu/icu_utf.h"
14 #include "util/build_config.h"
15
16 namespace base {
17
18 namespace {
19
// U+FFFD: the code point written to the output in place of every invalid
// code point found in the input.
constexpr int32_t kErrorCodePoint{0xFFFD};
21
22 // Size coefficient ----------------------------------------------------------
23 // The maximum number of codeunits in the destination encoding corresponding to
24 // one codeunit in the source encoding.
25
// Generic case, only valid when the destination code unit is wider than the
// source code unit. Other directions must supply an explicit specialization.
template <typename SrcChar, typename DestChar>
struct SizeCoefficient {
  // An ASCII symbol occupies exactly one code unit in every encoding.
  static constexpr int value{1};

  static_assert(sizeof(DestChar) > sizeof(SrcChar),
                "Default case: from a smaller encoding to the bigger one");
};
34
35 template <>
36 struct SizeCoefficient<char16_t, char> {
37 // One UTF-16 codeunit corresponds to at most 3 codeunits in UTF-8.
38 static constexpr int value = 3;
39 };
40
41 template <typename SrcChar, typename DestChar>
42 constexpr int size_coefficient_v =
43 SizeCoefficient<std::decay_t<SrcChar>, std::decay_t<DestChar>>::value;
44
45 // UnicodeAppendUnsafe --------------------------------------------------------
46 // Function overloads that write code_point to the output string. Output string
47 // has to have enough space for the codepoint.
48
UnicodeAppendUnsafe(char * out,int32_t * size,uint32_t code_point)49 void UnicodeAppendUnsafe(char* out, int32_t* size, uint32_t code_point) {
50 CBU8_APPEND_UNSAFE(out, *size, code_point);
51 }
52
UnicodeAppendUnsafe(char16_t * out,int32_t * size,uint32_t code_point)53 void UnicodeAppendUnsafe(char16_t* out, int32_t* size, uint32_t code_point) {
54 CBU16_APPEND_UNSAFE(out, *size, code_point);
55 }
56
57 // DoUTFConversion ------------------------------------------------------------
58 // Main driver of UTFConversion specialized for different Src encodings.
59 // dest has to have enough room for the converted text.
60
61 template <typename DestChar>
DoUTFConversion(const char * src,int32_t src_len,DestChar * dest,int32_t * dest_len)62 bool DoUTFConversion(const char* src,
63 int32_t src_len,
64 DestChar* dest,
65 int32_t* dest_len) {
66 bool success = true;
67
68 for (int32_t i = 0; i < src_len;) {
69 int32_t code_point;
70 CBU8_NEXT(src, i, src_len, code_point);
71
72 if (!IsValidCodepoint(code_point)) {
73 success = false;
74 code_point = kErrorCodePoint;
75 }
76
77 UnicodeAppendUnsafe(dest, dest_len, code_point);
78 }
79
80 return success;
81 }
82
83 template <typename DestChar>
DoUTFConversion(const char16_t * src,int32_t src_len,DestChar * dest,int32_t * dest_len)84 bool DoUTFConversion(const char16_t* src,
85 int32_t src_len,
86 DestChar* dest,
87 int32_t* dest_len) {
88 bool success = true;
89
90 auto ConvertSingleChar = [&success](char16_t in) -> int32_t {
91 if (!CBU16_IS_SINGLE(in) || !IsValidCodepoint(in)) {
92 success = false;
93 return kErrorCodePoint;
94 }
95 return in;
96 };
97
98 int32_t i = 0;
99
100 // Always have another symbol in order to avoid checking boundaries in the
101 // middle of the surrogate pair.
102 while (i < src_len - 1) {
103 int32_t code_point;
104
105 if (CBU16_IS_LEAD(src[i]) && CBU16_IS_TRAIL(src[i + 1])) {
106 code_point = CBU16_GET_SUPPLEMENTARY(src[i], src[i + 1]);
107 if (!IsValidCodepoint(code_point)) {
108 code_point = kErrorCodePoint;
109 success = false;
110 }
111 i += 2;
112 } else {
113 code_point = ConvertSingleChar(src[i]);
114 ++i;
115 }
116
117 UnicodeAppendUnsafe(dest, dest_len, code_point);
118 }
119
120 if (i < src_len)
121 UnicodeAppendUnsafe(dest, dest_len, ConvertSingleChar(src[i]));
122
123 return success;
124 }
125
126 // UTFConversion --------------------------------------------------------------
127 // Function template for generating all UTF conversions.
128
129 template <typename InputString, typename DestString>
UTFConversion(const InputString & src_str,DestString * dest_str)130 bool UTFConversion(const InputString& src_str, DestString* dest_str) {
131 if (IsStringASCII(src_str)) {
132 dest_str->assign(src_str.begin(), src_str.end());
133 return true;
134 }
135
136 dest_str->resize(src_str.length() *
137 size_coefficient_v<typename InputString::value_type,
138 typename DestString::value_type>);
139
140 // Empty string is ASCII => it OK to call operator[].
141 auto* dest = &(*dest_str)[0];
142
143 // ICU requires 32 bit numbers.
144 int32_t src_len32 = static_cast<int32_t>(src_str.length());
145 int32_t dest_len32 = 0;
146
147 bool res = DoUTFConversion(src_str.data(), src_len32, dest, &dest_len32);
148
149 dest_str->resize(dest_len32);
150 dest_str->shrink_to_fit();
151
152 return res;
153 }
154
155 } // namespace
156
157 // UTF16 <-> UTF8 --------------------------------------------------------------
158
UTF8ToUTF16(const char * src,size_t src_len,std::u16string * output)159 bool UTF8ToUTF16(const char* src, size_t src_len, std::u16string* output) {
160 return UTFConversion(std::string_view(src, src_len), output);
161 }
162
UTF8ToUTF16(std::string_view utf8)163 std::u16string UTF8ToUTF16(std::string_view utf8) {
164 std::u16string ret;
165 // Ignore the success flag of this call, it will do the best it can for
166 // invalid input, which is what we want here.
167 UTF8ToUTF16(utf8.data(), utf8.size(), &ret);
168 return ret;
169 }
170
UTF16ToUTF8(const char16_t * src,size_t src_len,std::string * output)171 bool UTF16ToUTF8(const char16_t* src, size_t src_len, std::string* output) {
172 return UTFConversion(std::u16string_view(src, src_len), output);
173 }
174
UTF16ToUTF8(std::u16string_view utf16)175 std::string UTF16ToUTF8(std::u16string_view utf16) {
176 std::string ret;
177 // Ignore the success flag of this call, it will do the best it can for
178 // invalid input, which is what we want here.
179 UTF16ToUTF8(utf16.data(), utf16.length(), &ret);
180 return ret;
181 }
182
183 // ASCII <-> UTF-16 -----------------------------------------------------------
184
ASCIIToUTF16(std::string_view ascii)185 std::u16string ASCIIToUTF16(std::string_view ascii) {
186 DCHECK(IsStringASCII(ascii)) << ascii;
187 return std::u16string(ascii.begin(), ascii.end());
188 }
189
UTF16ToASCII(std::u16string_view utf16)190 std::string UTF16ToASCII(std::u16string_view utf16) {
191 DCHECK(IsStringASCII(utf16)) << UTF16ToUTF8(utf16);
192 return std::string(utf16.begin(), utf16.end());
193 }
194
195 } // namespace base
196