1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/strings/string_util.h"
6
7 #include <ctype.h>
8 #include <errno.h>
9 #include <math.h>
10 #include <stdarg.h>
11 #include <stdint.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <time.h>
16 #include <wchar.h>
17 #include <wctype.h>
18
19 #include <algorithm>
20 #include <iterator>
21 #include <limits>
22 #include <vector>
23
24 #include "base/logging.h"
25 #include "base/strings/utf_string_conversion_utils.h"
26 #include "base/strings/utf_string_conversions.h"
27 #include "base/third_party/icu/icu_utf.h"
28 #include "util/build_config.h"
29
30 namespace base {
31
32 namespace {
33
34 // Used by ReplaceStringPlaceholders to track the position in the string of
35 // replaced parameters.
36 struct ReplacementOffset {
ReplacementOffsetbase::__anon7c1f9a6a0111::ReplacementOffset37 ReplacementOffset(uintptr_t parameter, size_t offset)
38 : parameter(parameter), offset(offset) {}
39
40 // Index of the parameter.
41 uintptr_t parameter;
42
43 // Starting position in the string.
44 size_t offset;
45 };
46
CompareParameter(const ReplacementOffset & elem1,const ReplacementOffset & elem2)47 static bool CompareParameter(const ReplacementOffset& elem1,
48 const ReplacementOffset& elem2) {
49 return elem1.parameter < elem2.parameter;
50 }
51
52 // Assuming that a pointer is the size of a "machine word", then
53 // uintptr_t is an integer type that is also a machine word.
54 typedef uintptr_t MachineWord;
55 const uintptr_t kMachineWordAlignmentMask = sizeof(MachineWord) - 1;
56
IsAlignedToMachineWord(const void * pointer)57 inline bool IsAlignedToMachineWord(const void* pointer) {
58 return !(reinterpret_cast<MachineWord>(pointer) & kMachineWordAlignmentMask);
59 }
60
61 template <typename T>
AlignToMachineWord(T * pointer)62 inline T* AlignToMachineWord(T* pointer) {
63 return reinterpret_cast<T*>(reinterpret_cast<MachineWord>(pointer) &
64 ~kMachineWordAlignmentMask);
65 }
66
67 template <size_t size, typename CharacterType>
68 struct NonASCIIMask;
69 template <>
70 struct NonASCIIMask<4, char16_t> {
valuebase::__anon7c1f9a6a0111::NonASCIIMask71 static inline uint32_t value() { return 0xFF80FF80U; }
72 };
73 template <>
74 struct NonASCIIMask<4, char> {
valuebase::__anon7c1f9a6a0111::NonASCIIMask75 static inline uint32_t value() { return 0x80808080U; }
76 };
77 template <>
78 struct NonASCIIMask<8, char16_t> {
valuebase::__anon7c1f9a6a0111::NonASCIIMask79 static inline uint64_t value() { return 0xFF80FF80FF80FF80ULL; }
80 };
81 template <>
82 struct NonASCIIMask<8, char> {
valuebase::__anon7c1f9a6a0111::NonASCIIMask83 static inline uint64_t value() { return 0x8080808080808080ULL; }
84 };
85
86 } // namespace
87
88 namespace {
89
90 template <typename StringType>
ToLowerASCIIImpl(std::basic_string_view<typename StringType::value_type> str)91 StringType ToLowerASCIIImpl(
92 std::basic_string_view<typename StringType::value_type> str) {
93 StringType ret;
94 ret.reserve(str.size());
95 for (size_t i = 0; i < str.size(); i++)
96 ret.push_back(ToLowerASCII(str[i]));
97 return ret;
98 }
99
100 template <typename StringType>
ToUpperASCIIImpl(std::basic_string_view<typename StringType::value_type> str)101 StringType ToUpperASCIIImpl(
102 std::basic_string_view<typename StringType::value_type> str) {
103 StringType ret;
104 ret.reserve(str.size());
105 for (size_t i = 0; i < str.size(); i++)
106 ret.push_back(ToUpperASCII(str[i]));
107 return ret;
108 }
109
110 } // namespace
111
ToLowerASCII(std::string_view str)112 std::string ToLowerASCII(std::string_view str) {
113 return ToLowerASCIIImpl<std::string>(str);
114 }
115
ToLowerASCII(std::u16string_view str)116 std::u16string ToLowerASCII(std::u16string_view str) {
117 return ToLowerASCIIImpl<std::u16string>(str);
118 }
119
ToUpperASCII(std::string_view str)120 std::string ToUpperASCII(std::string_view str) {
121 return ToUpperASCIIImpl<std::string>(str);
122 }
123
ToUpperASCII(std::u16string_view str)124 std::u16string ToUpperASCII(std::u16string_view str) {
125 return ToUpperASCIIImpl<std::u16string>(str);
126 }
127
128 template <class StringType>
CompareCaseInsensitiveASCIIT(std::basic_string_view<typename StringType::value_type> a,std::basic_string_view<typename StringType::value_type> b)129 int CompareCaseInsensitiveASCIIT(
130 std::basic_string_view<typename StringType::value_type> a,
131 std::basic_string_view<typename StringType::value_type> b) {
132 // Find the first characters that aren't equal and compare them. If the end
133 // of one of the strings is found before a nonequal character, the lengths
134 // of the strings are compared.
135 size_t i = 0;
136 while (i < a.length() && i < b.length()) {
137 typename StringType::value_type lower_a = ToLowerASCII(a[i]);
138 typename StringType::value_type lower_b = ToLowerASCII(b[i]);
139 if (lower_a < lower_b)
140 return -1;
141 if (lower_a > lower_b)
142 return 1;
143 i++;
144 }
145
146 // End of one string hit before finding a different character. Expect the
147 // common case to be "strings equal" at this point so check that first.
148 if (a.length() == b.length())
149 return 0;
150
151 if (a.length() < b.length())
152 return -1;
153 return 1;
154 }
155
CompareCaseInsensitiveASCII(std::string_view a,std::string_view b)156 int CompareCaseInsensitiveASCII(std::string_view a, std::string_view b) {
157 return CompareCaseInsensitiveASCIIT<std::string>(a, b);
158 }
159
CompareCaseInsensitiveASCII(std::u16string_view a,std::u16string_view b)160 int CompareCaseInsensitiveASCII(std::u16string_view a, std::u16string_view b) {
161 return CompareCaseInsensitiveASCIIT<std::u16string>(a, b);
162 }
163
EqualsCaseInsensitiveASCII(std::string_view a,std::string_view b)164 bool EqualsCaseInsensitiveASCII(std::string_view a, std::string_view b) {
165 if (a.length() != b.length())
166 return false;
167 return CompareCaseInsensitiveASCIIT<std::string>(a, b) == 0;
168 }
169
EqualsCaseInsensitiveASCII(std::u16string_view a,std::u16string_view b)170 bool EqualsCaseInsensitiveASCII(std::u16string_view a, std::u16string_view b) {
171 if (a.length() != b.length())
172 return false;
173 return CompareCaseInsensitiveASCIIT<std::u16string>(a, b) == 0;
174 }
175
176 template <class StringType>
177 bool ReplaceCharsT(
178 const StringType& input,
179 std::basic_string_view<typename StringType::value_type> find_any_of_these,
180 std::basic_string_view<typename StringType::value_type> replace_with,
181 StringType* output);
182
ReplaceChars(const std::u16string & input,std::u16string_view replace_chars,const std::u16string & replace_with,std::u16string * output)183 bool ReplaceChars(const std::u16string& input,
184 std::u16string_view replace_chars,
185 const std::u16string& replace_with,
186 std::u16string* output) {
187 return ReplaceCharsT(input, replace_chars, std::u16string_view(replace_with),
188 output);
189 }
190
ReplaceChars(const std::string & input,std::string_view replace_chars,const std::string & replace_with,std::string * output)191 bool ReplaceChars(const std::string& input,
192 std::string_view replace_chars,
193 const std::string& replace_with,
194 std::string* output) {
195 return ReplaceCharsT(input, replace_chars, std::string_view(replace_with),
196 output);
197 }
198
RemoveChars(const std::u16string & input,std::u16string_view remove_chars,std::u16string * output)199 bool RemoveChars(const std::u16string& input,
200 std::u16string_view remove_chars,
201 std::u16string* output) {
202 return ReplaceCharsT(input, remove_chars, std::u16string_view(), output);
203 }
204
RemoveChars(const std::string & input,std::string_view remove_chars,std::string * output)205 bool RemoveChars(const std::string& input,
206 std::string_view remove_chars,
207 std::string* output) {
208 return ReplaceCharsT(input, remove_chars, std::string_view(), output);
209 }
210
211 template <typename Str>
TrimStringT(const Str & input,std::basic_string_view<typename Str::value_type> trim_chars,TrimPositions positions,Str * output)212 TrimPositions TrimStringT(
213 const Str& input,
214 std::basic_string_view<typename Str::value_type> trim_chars,
215 TrimPositions positions,
216 Str* output) {
217 // Find the edges of leading/trailing whitespace as desired. Need to use
218 // a std::string_view version of input to be able to call find* on it with the
219 // std::string_view version of trim_chars (normally the trim_chars will be a
220 // constant so avoid making a copy).
221 std::basic_string_view<typename Str::value_type> input_piece(input);
222 const size_t last_char = input.length() - 1;
223 const size_t first_good_char = (positions & TRIM_LEADING)
224 ? input_piece.find_first_not_of(trim_chars)
225 : 0;
226 const size_t last_good_char = (positions & TRIM_TRAILING)
227 ? input_piece.find_last_not_of(trim_chars)
228 : last_char;
229
230 // When the string was all trimmed, report that we stripped off characters
231 // from whichever position the caller was interested in. For empty input, we
232 // stripped no characters, but we still need to clear |output|.
233 if (input.empty() || (first_good_char == Str::npos) ||
234 (last_good_char == Str::npos)) {
235 bool input_was_empty = input.empty(); // in case output == &input
236 output->clear();
237 return input_was_empty ? TRIM_NONE : positions;
238 }
239
240 // Trim.
241 *output = input.substr(first_good_char, last_good_char - first_good_char + 1);
242
243 // Return where we trimmed from.
244 return static_cast<TrimPositions>(
245 ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) |
246 ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING));
247 }
248
TrimString(const std::u16string & input,std::u16string_view trim_chars,std::u16string * output)249 bool TrimString(const std::u16string& input,
250 std::u16string_view trim_chars,
251 std::u16string* output) {
252 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
253 }
254
TrimString(const std::string & input,std::string_view trim_chars,std::string * output)255 bool TrimString(const std::string& input,
256 std::string_view trim_chars,
257 std::string* output) {
258 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
259 }
260
261 template <typename char_type>
TrimStringPieceT(std::basic_string_view<char_type> input,std::basic_string_view<char_type> trim_chars,TrimPositions positions)262 std::basic_string_view<char_type> TrimStringPieceT(
263 std::basic_string_view<char_type> input,
264 std::basic_string_view<char_type> trim_chars,
265 TrimPositions positions) {
266 size_t begin =
267 (positions & TRIM_LEADING) ? input.find_first_not_of(trim_chars) : 0;
268 if (begin == std::basic_string_view<char_type>::npos)
269 return std::basic_string_view<char_type>(); // All trimmed.
270
271 size_t end = (positions & TRIM_TRAILING)
272 ? input.find_last_not_of(trim_chars) + 1
273 : input.size();
274 return input.substr(begin, end - begin);
275 }
276
TrimString(std::u16string_view input,std::u16string_view trim_chars,TrimPositions positions)277 std::u16string_view TrimString(std::u16string_view input,
278 std::u16string_view trim_chars,
279 TrimPositions positions) {
280 return TrimStringPieceT(input, trim_chars, positions);
281 }
282
TrimString(std::string_view input,std::string_view trim_chars,TrimPositions positions)283 std::string_view TrimString(std::string_view input,
284 std::string_view trim_chars,
285 TrimPositions positions) {
286 return TrimStringPieceT(input, trim_chars, positions);
287 }
288
TruncateUTF8ToByteSize(const std::string & input,const size_t byte_size,std::string * output)289 void TruncateUTF8ToByteSize(const std::string& input,
290 const size_t byte_size,
291 std::string* output) {
292 DCHECK(output);
293 if (byte_size > input.length()) {
294 *output = input;
295 return;
296 }
297 DCHECK_LE(byte_size,
298 static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
299 // Note: This cast is necessary because CBU8_NEXT uses int32_ts.
300 int32_t truncation_length = static_cast<int32_t>(byte_size);
301 int32_t char_index = truncation_length - 1;
302 const char* data = input.data();
303
304 // Using CBU8, we will move backwards from the truncation point
305 // to the beginning of the string looking for a valid UTF8
306 // character. Once a full UTF8 character is found, we will
307 // truncate the string to the end of that character.
308 while (char_index >= 0) {
309 int32_t prev = char_index;
310 base_icu::UChar32 code_point = 0;
311 CBU8_NEXT(data, char_index, truncation_length, code_point);
312 if (!IsValidCharacter(code_point) || !IsValidCodepoint(code_point)) {
313 char_index = prev - 1;
314 } else {
315 break;
316 }
317 }
318
319 if (char_index >= 0)
320 *output = input.substr(0, char_index);
321 else
322 output->clear();
323 }
324
TrimWhitespace(const std::u16string & input,TrimPositions positions,std::u16string * output)325 TrimPositions TrimWhitespace(const std::u16string& input,
326 TrimPositions positions,
327 std::u16string* output) {
328 return TrimStringT(input, std::u16string_view(kWhitespaceUTF16), positions,
329 output);
330 }
331
TrimWhitespace(std::u16string_view input,TrimPositions positions)332 std::u16string_view TrimWhitespace(std::u16string_view input,
333 TrimPositions positions) {
334 return TrimStringPieceT(input, std::u16string_view(kWhitespaceUTF16),
335 positions);
336 }
337
TrimWhitespaceASCII(const std::string & input,TrimPositions positions,std::string * output)338 TrimPositions TrimWhitespaceASCII(const std::string& input,
339 TrimPositions positions,
340 std::string* output) {
341 return TrimStringT(input, std::string_view(kWhitespaceASCII), positions,
342 output);
343 }
344
TrimWhitespaceASCII(std::string_view input,TrimPositions positions)345 std::string_view TrimWhitespaceASCII(std::string_view input,
346 TrimPositions positions) {
347 return TrimStringPieceT(input, std::string_view(kWhitespaceASCII), positions);
348 }
349
350 template <typename STR>
CollapseWhitespaceT(const STR & text,bool trim_sequences_with_line_breaks)351 STR CollapseWhitespaceT(const STR& text, bool trim_sequences_with_line_breaks) {
352 STR result;
353 result.resize(text.size());
354
355 // Set flags to pretend we're already in a trimmed whitespace sequence, so we
356 // will trim any leading whitespace.
357 bool in_whitespace = true;
358 bool already_trimmed = true;
359
360 int chars_written = 0;
361 for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) {
362 if (IsUnicodeWhitespace(*i)) {
363 if (!in_whitespace) {
364 // Reduce all whitespace sequences to a single space.
365 in_whitespace = true;
366 result[chars_written++] = L' ';
367 }
368 if (trim_sequences_with_line_breaks && !already_trimmed &&
369 ((*i == '\n') || (*i == '\r'))) {
370 // Whitespace sequences containing CR or LF are eliminated entirely.
371 already_trimmed = true;
372 --chars_written;
373 }
374 } else {
375 // Non-whitespace characters are copied straight across.
376 in_whitespace = false;
377 already_trimmed = false;
378 result[chars_written++] = *i;
379 }
380 }
381
382 if (in_whitespace && !already_trimmed) {
383 // Any trailing whitespace is eliminated.
384 --chars_written;
385 }
386
387 result.resize(chars_written);
388 return result;
389 }
390
CollapseWhitespace(const std::u16string & text,bool trim_sequences_with_line_breaks)391 std::u16string CollapseWhitespace(const std::u16string& text,
392 bool trim_sequences_with_line_breaks) {
393 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
394 }
395
CollapseWhitespaceASCII(const std::string & text,bool trim_sequences_with_line_breaks)396 std::string CollapseWhitespaceASCII(const std::string& text,
397 bool trim_sequences_with_line_breaks) {
398 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
399 }
400
ContainsOnlyChars(std::string_view input,std::string_view characters)401 bool ContainsOnlyChars(std::string_view input, std::string_view characters) {
402 return input.find_first_not_of(characters) == std::string_view::npos;
403 }
404
ContainsOnlyChars(std::u16string_view input,std::u16string_view characters)405 bool ContainsOnlyChars(std::u16string_view input,
406 std::u16string_view characters) {
407 return input.find_first_not_of(characters) == std::u16string_view::npos;
408 }
409
410 template <class Char>
DoIsStringASCII(const Char * characters,size_t length)411 inline bool DoIsStringASCII(const Char* characters, size_t length) {
412 MachineWord all_char_bits = 0;
413 const Char* end = characters + length;
414
415 // Prologue: align the input.
416 while (!IsAlignedToMachineWord(characters) && characters != end) {
417 all_char_bits |= *characters;
418 ++characters;
419 }
420
421 // Compare the values of CPU word size.
422 const Char* word_end = AlignToMachineWord(end);
423 const size_t loop_increment = sizeof(MachineWord) / sizeof(Char);
424 while (characters < word_end) {
425 all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters));
426 characters += loop_increment;
427 }
428
429 // Process the remaining bytes.
430 while (characters != end) {
431 all_char_bits |= *characters;
432 ++characters;
433 }
434
435 MachineWord non_ascii_bit_mask =
436 NonASCIIMask<sizeof(MachineWord), Char>::value();
437 return !(all_char_bits & non_ascii_bit_mask);
438 }
439
IsStringASCII(std::string_view str)440 bool IsStringASCII(std::string_view str) {
441 return DoIsStringASCII(str.data(), str.length());
442 }
443
IsStringASCII(std::u16string_view str)444 bool IsStringASCII(std::u16string_view str) {
445 return DoIsStringASCII(str.data(), str.length());
446 }
447
IsStringUTF8(std::string_view str)448 bool IsStringUTF8(std::string_view str) {
449 const char* src = str.data();
450 int32_t src_len = static_cast<int32_t>(str.length());
451 int32_t char_index = 0;
452
453 while (char_index < src_len) {
454 int32_t code_point;
455 CBU8_NEXT(src, char_index, src_len, code_point);
456 if (!IsValidCharacter(code_point))
457 return false;
458 }
459 return true;
460 }
461
462 // Implementation note: Normally this function will be called with a hardcoded
463 // constant for the lowercase_ascii parameter. Constructing a std::string_view
464 // from a C constant requires running strlen, so the result will be two passes
465 // through the buffers, one to file the length of lowercase_ascii, and one to
466 // compare each letter.
467 //
468 // This function could have taken a const char* to avoid this and only do one
469 // pass through the string. But the strlen is faster than the case-insensitive
470 // compares and lets us early-exit in the case that the strings are different
471 // lengths (will often be the case for non-matches). So whether one approach or
472 // the other will be faster depends on the case.
473 //
474 // The hardcoded strings are typically very short so it doesn't matter, and the
475 // string piece gives additional flexibility for the caller (doesn't have to be
476 // null terminated) so we choose the std::string_view route.
477 template <typename Str>
DoLowerCaseEqualsASCII(std::basic_string_view<typename Str::value_type> str,std::string_view lowercase_ascii)478 static inline bool DoLowerCaseEqualsASCII(
479 std::basic_string_view<typename Str::value_type> str,
480 std::string_view lowercase_ascii) {
481 if (str.size() != lowercase_ascii.size())
482 return false;
483 for (size_t i = 0; i < str.size(); i++) {
484 if (ToLowerASCII(str[i]) != lowercase_ascii[i])
485 return false;
486 }
487 return true;
488 }
489
LowerCaseEqualsASCII(std::string_view str,std::string_view lowercase_ascii)490 bool LowerCaseEqualsASCII(std::string_view str,
491 std::string_view lowercase_ascii) {
492 return DoLowerCaseEqualsASCII<std::string>(str, lowercase_ascii);
493 }
494
LowerCaseEqualsASCII(std::u16string_view str,std::string_view lowercase_ascii)495 bool LowerCaseEqualsASCII(std::u16string_view str,
496 std::string_view lowercase_ascii) {
497 return DoLowerCaseEqualsASCII<std::u16string>(str, lowercase_ascii);
498 }
499
EqualsASCII(std::u16string_view str,std::string_view ascii)500 bool EqualsASCII(std::u16string_view str, std::string_view ascii) {
501 if (str.length() != ascii.length())
502 return false;
503 return std::equal(ascii.begin(), ascii.end(), str.begin());
504 }
505
506 template <typename char_type>
StartsWithT(std::basic_string_view<char_type> str,std::basic_string_view<char_type> search_for,CompareCase case_sensitivity)507 bool StartsWithT(std::basic_string_view<char_type> str,
508 std::basic_string_view<char_type> search_for,
509 CompareCase case_sensitivity) {
510 if (search_for.size() > str.size())
511 return false;
512
513 std::basic_string_view<char_type> source = str.substr(0, search_for.size());
514
515 switch (case_sensitivity) {
516 case CompareCase::SENSITIVE:
517 return source == search_for;
518
519 case CompareCase::INSENSITIVE_ASCII:
520 return std::equal(search_for.begin(), search_for.end(), source.begin(),
521 CaseInsensitiveCompareASCII<char_type>());
522
523 default:
524 NOTREACHED();
525 return false;
526 }
527 }
528
StartsWith(std::string_view str,std::string_view search_for,CompareCase case_sensitivity)529 bool StartsWith(std::string_view str,
530 std::string_view search_for,
531 CompareCase case_sensitivity) {
532 return StartsWithT(str, search_for, case_sensitivity);
533 }
534
StartsWith(std::u16string_view str,std::u16string_view search_for,CompareCase case_sensitivity)535 bool StartsWith(std::u16string_view str,
536 std::u16string_view search_for,
537 CompareCase case_sensitivity) {
538 return StartsWithT(str, search_for, case_sensitivity);
539 }
540
541 template <typename Str>
EndsWithT(std::basic_string_view<typename Str::value_type> str,std::basic_string_view<typename Str::value_type> search_for,CompareCase case_sensitivity)542 bool EndsWithT(std::basic_string_view<typename Str::value_type> str,
543 std::basic_string_view<typename Str::value_type> search_for,
544 CompareCase case_sensitivity) {
545 if (search_for.size() > str.size())
546 return false;
547
548 std::basic_string_view<typename Str::value_type> source =
549 str.substr(str.size() - search_for.size(), search_for.size());
550
551 switch (case_sensitivity) {
552 case CompareCase::SENSITIVE:
553 return source == search_for;
554
555 case CompareCase::INSENSITIVE_ASCII:
556 return std::equal(
557 source.begin(), source.end(), search_for.begin(),
558 CaseInsensitiveCompareASCII<typename Str::value_type>());
559
560 default:
561 NOTREACHED();
562 return false;
563 }
564 }
565
EndsWith(std::string_view str,std::string_view search_for,CompareCase case_sensitivity)566 bool EndsWith(std::string_view str,
567 std::string_view search_for,
568 CompareCase case_sensitivity) {
569 return EndsWithT<std::string>(str, search_for, case_sensitivity);
570 }
571
EndsWith(std::u16string_view str,std::u16string_view search_for,CompareCase case_sensitivity)572 bool EndsWith(std::u16string_view str,
573 std::u16string_view search_for,
574 CompareCase case_sensitivity) {
575 return EndsWithT<std::u16string>(str, search_for, case_sensitivity);
576 }
577
HexDigitToInt(char16_t c)578 char HexDigitToInt(char16_t c) {
579 DCHECK(IsHexDigit(c));
580 if (c >= '0' && c <= '9')
581 return static_cast<char>(c - '0');
582 if (c >= 'A' && c <= 'F')
583 return static_cast<char>(c - 'A' + 10);
584 if (c >= 'a' && c <= 'f')
585 return static_cast<char>(c - 'a' + 10);
586 return 0;
587 }
588
IsUnicodeWhitespace(char16_t c)589 bool IsUnicodeWhitespace(char16_t c) {
590 // kWhitespaceWide is a NULL-terminated string
591 for (const char16_t* cur = kWhitespaceUTF16; *cur; ++cur) {
592 if (*cur == c)
593 return true;
594 }
595 return false;
596 }
597
598 static const char* const kByteStringsUnlocalized[] = {" B", " kB", " MB",
599 " GB", " TB", " PB"};
600
FormatBytesUnlocalized(int64_t bytes)601 std::u16string FormatBytesUnlocalized(int64_t bytes) {
602 double unit_amount = static_cast<double>(bytes);
603 size_t dimension = 0;
604 const int kKilo = 1024;
605 while (unit_amount >= kKilo &&
606 dimension < std::size(kByteStringsUnlocalized) - 1) {
607 unit_amount /= kKilo;
608 dimension++;
609 }
610
611 char buf[64];
612 if (bytes != 0 && dimension > 0 && unit_amount < 100) {
613 base::snprintf(buf, std::size(buf), "%.1lf%s", unit_amount,
614 kByteStringsUnlocalized[dimension]);
615 } else {
616 base::snprintf(buf, std::size(buf), "%.0lf%s", unit_amount,
617 kByteStringsUnlocalized[dimension]);
618 }
619
620 return ASCIIToUTF16(buf);
621 }
622
623 // A Matcher for DoReplaceMatchesAfterOffset() that matches substrings.
624 template <class StringType>
625 struct SubstringMatcher {
626 std::basic_string_view<typename StringType::value_type> find_this;
627
Findbase::SubstringMatcher628 size_t Find(const StringType& input, size_t pos) {
629 return input.find(find_this.data(), pos, find_this.length());
630 }
MatchSizebase::SubstringMatcher631 size_t MatchSize() { return find_this.length(); }
632 };
633
634 // A Matcher for DoReplaceMatchesAfterOffset() that matches single characters.
635 template <class StringType>
636 struct CharacterMatcher {
637 std::basic_string_view<typename StringType::value_type> find_any_of_these;
638
Findbase::CharacterMatcher639 size_t Find(const StringType& input, size_t pos) {
640 return input.find_first_of(find_any_of_these.data(), pos,
641 find_any_of_these.length());
642 }
MatchSizebase::CharacterMatcher643 constexpr size_t MatchSize() { return 1; }
644 };
645
646 enum class ReplaceType { REPLACE_ALL, REPLACE_FIRST };
647
648 // Runs in O(n) time in the length of |str|, and transforms the string without
649 // reallocating when possible. Returns |true| if any matches were found.
650 //
651 // This is parameterized on a |Matcher| traits type, so that it can be the
652 // implementation for both ReplaceChars() and ReplaceSubstringsAfterOffset().
653 template <class StringType, class Matcher>
DoReplaceMatchesAfterOffset(StringType * str,size_t initial_offset,Matcher matcher,std::basic_string_view<typename StringType::value_type> replace_with,ReplaceType replace_type)654 bool DoReplaceMatchesAfterOffset(
655 StringType* str,
656 size_t initial_offset,
657 Matcher matcher,
658 std::basic_string_view<typename StringType::value_type> replace_with,
659 ReplaceType replace_type) {
660 using CharTraits = typename StringType::traits_type;
661
662 const size_t find_length = matcher.MatchSize();
663 if (!find_length)
664 return false;
665
666 // If the find string doesn't appear, there's nothing to do.
667 size_t first_match = matcher.Find(*str, initial_offset);
668 if (first_match == StringType::npos)
669 return false;
670
671 // If we're only replacing one instance, there's no need to do anything
672 // complicated.
673 const size_t replace_length = replace_with.length();
674 if (replace_type == ReplaceType::REPLACE_FIRST) {
675 str->replace(first_match, find_length, replace_with.data(), replace_length);
676 return true;
677 }
678
679 // If the find and replace strings are the same length, we can simply use
680 // replace() on each instance, and finish the entire operation in O(n) time.
681 if (find_length == replace_length) {
682 auto* buffer = &((*str)[0]);
683 for (size_t offset = first_match; offset != StringType::npos;
684 offset = matcher.Find(*str, offset + replace_length)) {
685 CharTraits::copy(buffer + offset, replace_with.data(), replace_length);
686 }
687 return true;
688 }
689
690 // Since the find and replace strings aren't the same length, a loop like the
691 // one above would be O(n^2) in the worst case, as replace() will shift the
692 // entire remaining string each time. We need to be more clever to keep things
693 // O(n).
694 //
695 // When the string is being shortened, it's possible to just shift the matches
696 // down in one pass while finding, and truncate the length at the end of the
697 // search.
698 //
699 // If the string is being lengthened, more work is required. The strategy used
700 // here is to make two find() passes through the string. The first pass counts
701 // the number of matches to determine the new size. The second pass will
702 // either construct the new string into a new buffer (if the existing buffer
703 // lacked capacity), or else -- if there is room -- create a region of scratch
704 // space after |first_match| by shifting the tail of the string to a higher
705 // index, and doing in-place moves from the tail to lower indices thereafter.
706 size_t str_length = str->length();
707 size_t expansion = 0;
708 if (replace_length > find_length) {
709 // This operation lengthens the string; determine the new length by counting
710 // matches.
711 const size_t expansion_per_match = (replace_length - find_length);
712 size_t num_matches = 0;
713 for (size_t match = first_match; match != StringType::npos;
714 match = matcher.Find(*str, match + find_length)) {
715 expansion += expansion_per_match;
716 ++num_matches;
717 }
718 const size_t final_length = str_length + expansion;
719
720 if (str->capacity() < final_length) {
721 // If we'd have to allocate a new buffer to grow the string, build the
722 // result directly into the new allocation via append().
723 StringType src(str->get_allocator());
724 str->swap(src);
725 str->reserve(final_length);
726
727 size_t pos = 0;
728 for (size_t match = first_match;; match = matcher.Find(src, pos)) {
729 str->append(src, pos, match - pos);
730 str->append(replace_with.data(), replace_length);
731 pos = match + find_length;
732
733 // A mid-loop test/break enables skipping the final Find() call; the
734 // number of matches is known, so don't search past the last one.
735 if (!--num_matches)
736 break;
737 }
738
739 // Handle substring after the final match.
740 str->append(src, pos, str_length - pos);
741 return true;
742 }
743
744 // Prepare for the copy/move loop below -- expand the string to its final
745 // size by shifting the data after the first match to the end of the resized
746 // string.
747 size_t shift_src = first_match + find_length;
748 size_t shift_dst = shift_src + expansion;
749
750 // Big |expansion| factors (relative to |str_length|) require padding up to
751 // |shift_dst|.
752 if (shift_dst > str_length)
753 str->resize(shift_dst);
754
755 str->replace(shift_dst, str_length - shift_src, *str, shift_src,
756 str_length - shift_src);
757 str_length = final_length;
758 }
759
760 // We can alternate replacement and move operations. This won't overwrite the
761 // unsearched region of the string so long as |write_offset| <= |read_offset|;
762 // that condition is always satisfied because:
763 //
764 // (a) If the string is being shortened, |expansion| is zero and
765 // |write_offset| grows slower than |read_offset|.
766 //
767 // (b) If the string is being lengthened, |write_offset| grows faster than
768 // |read_offset|, but |expansion| is big enough so that |write_offset|
769 // will only catch up to |read_offset| at the point of the last match.
770 auto* buffer = &((*str)[0]);
771 size_t write_offset = first_match;
772 size_t read_offset = first_match + expansion;
773 do {
774 if (replace_length) {
775 CharTraits::copy(buffer + write_offset, replace_with.data(),
776 replace_length);
777 write_offset += replace_length;
778 }
779 read_offset += find_length;
780
781 // min() clamps StringType::npos (the largest unsigned value) to str_length.
782 size_t match = std::min(matcher.Find(*str, read_offset), str_length);
783
784 size_t length = match - read_offset;
785 if (length) {
786 CharTraits::move(buffer + write_offset, buffer + read_offset, length);
787 write_offset += length;
788 read_offset += length;
789 }
790 } while (read_offset < str_length);
791
792 // If we're shortening the string, truncate it now.
793 str->resize(write_offset);
794 return true;
795 }
796
797 template <class StringType>
ReplaceCharsT(const StringType & input,std::basic_string_view<typename StringType::value_type> find_any_of_these,std::basic_string_view<typename StringType::value_type> replace_with,StringType * output)798 bool ReplaceCharsT(
799 const StringType& input,
800 std::basic_string_view<typename StringType::value_type> find_any_of_these,
801 std::basic_string_view<typename StringType::value_type> replace_with,
802 StringType* output) {
803 // Commonly, this is called with output and input being the same string; in
804 // that case, this assignment is inexpensive.
805 *output = input;
806
807 return DoReplaceMatchesAfterOffset(
808 output, 0, CharacterMatcher<StringType>{find_any_of_these}, replace_with,
809 ReplaceType::REPLACE_ALL);
810 }
811
ReplaceFirstSubstringAfterOffset(std::u16string * str,size_t start_offset,std::u16string_view find_this,std::u16string_view replace_with)812 void ReplaceFirstSubstringAfterOffset(std::u16string* str,
813 size_t start_offset,
814 std::u16string_view find_this,
815 std::u16string_view replace_with) {
816 DoReplaceMatchesAfterOffset(str, start_offset,
817 SubstringMatcher<std::u16string>{find_this},
818 replace_with, ReplaceType::REPLACE_FIRST);
819 }
820
ReplaceFirstSubstringAfterOffset(std::string * str,size_t start_offset,std::string_view find_this,std::string_view replace_with)821 void ReplaceFirstSubstringAfterOffset(std::string* str,
822 size_t start_offset,
823 std::string_view find_this,
824 std::string_view replace_with) {
825 DoReplaceMatchesAfterOffset(str, start_offset,
826 SubstringMatcher<std::string>{find_this},
827 replace_with, ReplaceType::REPLACE_FIRST);
828 }
829
ReplaceSubstringsAfterOffset(std::u16string * str,size_t start_offset,std::u16string_view find_this,std::u16string_view replace_with)830 void ReplaceSubstringsAfterOffset(std::u16string* str,
831 size_t start_offset,
832 std::u16string_view find_this,
833 std::u16string_view replace_with) {
834 DoReplaceMatchesAfterOffset(str, start_offset,
835 SubstringMatcher<std::u16string>{find_this},
836 replace_with, ReplaceType::REPLACE_ALL);
837 }
838
ReplaceSubstringsAfterOffset(std::string * str,size_t start_offset,std::string_view find_this,std::string_view replace_with)839 void ReplaceSubstringsAfterOffset(std::string* str,
840 size_t start_offset,
841 std::string_view find_this,
842 std::string_view replace_with) {
843 DoReplaceMatchesAfterOffset(str, start_offset,
844 SubstringMatcher<std::string>{find_this},
845 replace_with, ReplaceType::REPLACE_ALL);
846 }
847
848 template <class string_type>
WriteIntoT(string_type * str,size_t length_with_null)849 inline typename string_type::value_type* WriteIntoT(string_type* str,
850 size_t length_with_null) {
851 DCHECK_GT(length_with_null, 1u);
852 str->reserve(length_with_null);
853 str->resize(length_with_null - 1);
854 return &((*str)[0]);
855 }
856
WriteInto(std::string * str,size_t length_with_null)857 char* WriteInto(std::string* str, size_t length_with_null) {
858 return WriteIntoT(str, length_with_null);
859 }
860
WriteInto(std::u16string * str,size_t length_with_null)861 char16_t* WriteInto(std::u16string* str, size_t length_with_null) {
862 return WriteIntoT(str, length_with_null);
863 }
864
865 #if defined(_MSC_VER) && !defined(__clang__)
866 // Work around VC++ code-gen bug. https://crbug.com/804884
867 #pragma optimize("", off)
868 #endif
869
870 // Generic version for all JoinString overloads. |list_type| must be a sequence
871 // (std::vector or std::initializer_list) of strings/string_views of any type.
872 template <typename char_type, typename list_type>
JoinStringT(const list_type & parts,std::basic_string_view<char_type> sep)873 static std::basic_string<char_type> JoinStringT(
874 const list_type& parts,
875 std::basic_string_view<char_type> sep) {
876 if (parts.size() == 0)
877 return std::basic_string<char_type>();
878
879 // Pre-allocate the eventual size of the string. Start with the size of all of
880 // the separators (note that this *assumes* parts.size() > 0).
881 size_t total_size = (parts.size() - 1) * sep.size();
882 for (const auto& part : parts)
883 total_size += part.size();
884 std::basic_string<char_type> result;
885 result.reserve(total_size);
886
887 auto iter = parts.begin();
888 DCHECK(iter != parts.end());
889 result.append(*iter);
890 ++iter;
891
892 for (; iter != parts.end(); ++iter) {
893 result.append(sep);
894 result.append(*iter);
895 }
896
897 // Sanity-check that we pre-allocated correctly.
898 DCHECK_EQ(total_size, result.size());
899
900 return result;
901 }
902
JoinString(const std::vector<std::string> & parts,std::string_view separator)903 std::string JoinString(const std::vector<std::string>& parts,
904 std::string_view separator) {
905 return JoinStringT(parts, separator);
906 }
907
JoinString(const std::vector<std::u16string> & parts,std::u16string_view separator)908 std::u16string JoinString(const std::vector<std::u16string>& parts,
909 std::u16string_view separator) {
910 return JoinStringT(parts, separator);
911 }
912
913 #if defined(_MSC_VER) && !defined(__clang__)
914 // Work around VC++ code-gen bug. https://crbug.com/804884
915 #pragma optimize("", on)
916 #endif
917
JoinString(const std::vector<std::string_view> & parts,std::string_view separator)918 std::string JoinString(const std::vector<std::string_view>& parts,
919 std::string_view separator) {
920 return JoinStringT(parts, separator);
921 }
922
JoinString(const std::vector<std::u16string_view> & parts,std::u16string_view separator)923 std::u16string JoinString(const std::vector<std::u16string_view>& parts,
924 std::u16string_view separator) {
925 return JoinStringT(parts, separator);
926 }
927
JoinString(std::initializer_list<std::string_view> parts,std::string_view separator)928 std::string JoinString(std::initializer_list<std::string_view> parts,
929 std::string_view separator) {
930 return JoinStringT(parts, separator);
931 }
932
JoinString(std::initializer_list<std::u16string_view> parts,std::u16string_view separator)933 std::u16string JoinString(std::initializer_list<std::u16string_view> parts,
934 std::u16string_view separator) {
935 return JoinStringT(parts, separator);
936 }
937
938 template <class FormatStringType, class OutStringType>
DoReplaceStringPlaceholders(const FormatStringType & format_string,const std::vector<OutStringType> & subst,std::vector<size_t> * offsets)939 OutStringType DoReplaceStringPlaceholders(
940 const FormatStringType& format_string,
941 const std::vector<OutStringType>& subst,
942 std::vector<size_t>* offsets) {
943 size_t substitutions = subst.size();
944 DCHECK_LT(substitutions, 10U);
945
946 size_t sub_length = 0;
947 for (const auto& cur : subst)
948 sub_length += cur.length();
949
950 OutStringType formatted;
951 formatted.reserve(format_string.length() + sub_length);
952
953 std::vector<ReplacementOffset> r_offsets;
954 for (auto i = format_string.begin(); i != format_string.end(); ++i) {
955 if ('$' == *i) {
956 if (i + 1 != format_string.end()) {
957 ++i;
958 if ('$' == *i) {
959 while (i != format_string.end() && '$' == *i) {
960 formatted.push_back('$');
961 ++i;
962 }
963 --i;
964 } else {
965 if (*i < '1' || *i > '9') {
966 DLOG(ERROR) << "Invalid placeholder: $" << *i;
967 continue;
968 }
969 uintptr_t index = *i - '1';
970 if (offsets) {
971 ReplacementOffset r_offset(index,
972 static_cast<int>(formatted.size()));
973 r_offsets.insert(
974 std::upper_bound(r_offsets.begin(), r_offsets.end(), r_offset,
975 &CompareParameter),
976 r_offset);
977 }
978 if (index < substitutions)
979 formatted.append(subst.at(index));
980 }
981 }
982 } else {
983 formatted.push_back(*i);
984 }
985 }
986 if (offsets) {
987 for (const auto& cur : r_offsets)
988 offsets->push_back(cur.offset);
989 }
990 return formatted;
991 }
992
ReplaceStringPlaceholders(const std::u16string & format_string,const std::vector<std::u16string> & subst,std::vector<size_t> * offsets)993 std::u16string ReplaceStringPlaceholders(
994 const std::u16string& format_string,
995 const std::vector<std::u16string>& subst,
996 std::vector<size_t>* offsets) {
997 return DoReplaceStringPlaceholders(format_string, subst, offsets);
998 }
999
ReplaceStringPlaceholders(std::string_view format_string,const std::vector<std::string> & subst,std::vector<size_t> * offsets)1000 std::string ReplaceStringPlaceholders(std::string_view format_string,
1001 const std::vector<std::string>& subst,
1002 std::vector<size_t>* offsets) {
1003 return DoReplaceStringPlaceholders(format_string, subst, offsets);
1004 }
1005
ReplaceStringPlaceholders(const std::u16string & format_string,const std::u16string & a,size_t * offset)1006 std::u16string ReplaceStringPlaceholders(const std::u16string& format_string,
1007 const std::u16string& a,
1008 size_t* offset) {
1009 std::vector<size_t> offsets;
1010 std::vector<std::u16string> subst;
1011 subst.push_back(a);
1012 std::u16string result =
1013 ReplaceStringPlaceholders(format_string, subst, &offsets);
1014
1015 DCHECK_EQ(1U, offsets.size());
1016 if (offset)
1017 *offset = offsets[0];
1018 return result;
1019 }
1020
1021 // The following code is compatible with the OpenBSD lcpy interface. See:
1022 // http://www.gratisoft.us/todd/papers/strlcpy.html
1023 // ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c
1024
1025 namespace {
1026
1027 template <typename CHAR>
lcpyT(CHAR * dst,const CHAR * src,size_t dst_size)1028 size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) {
1029 for (size_t i = 0; i < dst_size; ++i) {
1030 if ((dst[i] = src[i]) == 0) // We hit and copied the terminating NULL.
1031 return i;
1032 }
1033
1034 // We were left off at dst_size. We over copied 1 byte. Null terminate.
1035 if (dst_size != 0)
1036 dst[dst_size - 1] = 0;
1037
1038 // Count the rest of the |src|, and return it's length in characters.
1039 while (src[dst_size])
1040 ++dst_size;
1041 return dst_size;
1042 }
1043
1044 } // namespace
1045
1046 } // namespace base
1047