• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/strings/string_util.h"
6 
7 #include <ctype.h>
8 #include <errno.h>
9 #include <math.h>
10 #include <stdarg.h>
11 #include <stdint.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <time.h>
16 #include <wchar.h>
17 #include <wctype.h>
18 
19 #include <algorithm>
20 #include <iterator>
21 #include <limits>
22 #include <vector>
23 
24 #include "base/logging.h"
25 #include "base/strings/utf_string_conversion_utils.h"
26 #include "base/strings/utf_string_conversions.h"
27 #include "base/third_party/icu/icu_utf.h"
28 #include "util/build_config.h"
29 
30 namespace base {
31 
32 namespace {
33 
34 // Used by ReplaceStringPlaceholders to track the position in the string of
35 // replaced parameters.
36 struct ReplacementOffset {
ReplacementOffsetbase::__anonf98482750111::ReplacementOffset37   ReplacementOffset(uintptr_t parameter, size_t offset)
38       : parameter(parameter), offset(offset) {}
39 
40   // Index of the parameter.
41   uintptr_t parameter;
42 
43   // Starting position in the string.
44   size_t offset;
45 };
46 
CompareParameter(const ReplacementOffset & elem1,const ReplacementOffset & elem2)47 static bool CompareParameter(const ReplacementOffset& elem1,
48                              const ReplacementOffset& elem2) {
49   return elem1.parameter < elem2.parameter;
50 }
51 
52 // Assuming that a pointer is the size of a "machine word", then
53 // uintptr_t is an integer type that is also a machine word.
54 typedef uintptr_t MachineWord;
55 const uintptr_t kMachineWordAlignmentMask = sizeof(MachineWord) - 1;
56 
IsAlignedToMachineWord(const void * pointer)57 inline bool IsAlignedToMachineWord(const void* pointer) {
58   return !(reinterpret_cast<MachineWord>(pointer) & kMachineWordAlignmentMask);
59 }
60 
61 template <typename T>
AlignToMachineWord(T * pointer)62 inline T* AlignToMachineWord(T* pointer) {
63   return reinterpret_cast<T*>(reinterpret_cast<MachineWord>(pointer) &
64                               ~kMachineWordAlignmentMask);
65 }
66 
67 template <size_t size, typename CharacterType>
68 struct NonASCIIMask;
69 template <>
70 struct NonASCIIMask<4, char16_t> {
valuebase::__anonf98482750111::NonASCIIMask71   static inline uint32_t value() { return 0xFF80FF80U; }
72 };
73 template <>
74 struct NonASCIIMask<4, char> {
valuebase::__anonf98482750111::NonASCIIMask75   static inline uint32_t value() { return 0x80808080U; }
76 };
77 template <>
78 struct NonASCIIMask<8, char16_t> {
valuebase::__anonf98482750111::NonASCIIMask79   static inline uint64_t value() { return 0xFF80FF80FF80FF80ULL; }
80 };
81 template <>
82 struct NonASCIIMask<8, char> {
valuebase::__anonf98482750111::NonASCIIMask83   static inline uint64_t value() { return 0x8080808080808080ULL; }
84 };
85 
86 }  // namespace
87 
88 namespace {
89 
90 template <typename StringType>
ToLowerASCIIImpl(std::basic_string_view<typename StringType::value_type> str)91 StringType ToLowerASCIIImpl(
92     std::basic_string_view<typename StringType::value_type> str) {
93   StringType ret;
94   ret.reserve(str.size());
95   for (size_t i = 0; i < str.size(); i++)
96     ret.push_back(ToLowerASCII(str[i]));
97   return ret;
98 }
99 
100 template <typename StringType>
ToUpperASCIIImpl(std::basic_string_view<typename StringType::value_type> str)101 StringType ToUpperASCIIImpl(
102     std::basic_string_view<typename StringType::value_type> str) {
103   StringType ret;
104   ret.reserve(str.size());
105   for (size_t i = 0; i < str.size(); i++)
106     ret.push_back(ToUpperASCII(str[i]));
107   return ret;
108 }
109 
110 }  // namespace
111 
ToLowerASCII(std::string_view str)112 std::string ToLowerASCII(std::string_view str) {
113   return ToLowerASCIIImpl<std::string>(str);
114 }
115 
ToLowerASCII(std::u16string_view str)116 std::u16string ToLowerASCII(std::u16string_view str) {
117   return ToLowerASCIIImpl<std::u16string>(str);
118 }
119 
ToUpperASCII(std::string_view str)120 std::string ToUpperASCII(std::string_view str) {
121   return ToUpperASCIIImpl<std::string>(str);
122 }
123 
ToUpperASCII(std::u16string_view str)124 std::u16string ToUpperASCII(std::u16string_view str) {
125   return ToUpperASCIIImpl<std::u16string>(str);
126 }
127 
128 template <class StringType>
CompareCaseInsensitiveASCIIT(std::basic_string_view<typename StringType::value_type> a,std::basic_string_view<typename StringType::value_type> b)129 int CompareCaseInsensitiveASCIIT(
130     std::basic_string_view<typename StringType::value_type> a,
131     std::basic_string_view<typename StringType::value_type> b) {
132   // Find the first characters that aren't equal and compare them.  If the end
133   // of one of the strings is found before a nonequal character, the lengths
134   // of the strings are compared.
135   size_t i = 0;
136   while (i < a.length() && i < b.length()) {
137     typename StringType::value_type lower_a = ToLowerASCII(a[i]);
138     typename StringType::value_type lower_b = ToLowerASCII(b[i]);
139     if (lower_a < lower_b)
140       return -1;
141     if (lower_a > lower_b)
142       return 1;
143     i++;
144   }
145 
146   // End of one string hit before finding a different character. Expect the
147   // common case to be "strings equal" at this point so check that first.
148   if (a.length() == b.length())
149     return 0;
150 
151   if (a.length() < b.length())
152     return -1;
153   return 1;
154 }
155 
CompareCaseInsensitiveASCII(std::string_view a,std::string_view b)156 int CompareCaseInsensitiveASCII(std::string_view a, std::string_view b) {
157   return CompareCaseInsensitiveASCIIT<std::string>(a, b);
158 }
159 
CompareCaseInsensitiveASCII(std::u16string_view a,std::u16string_view b)160 int CompareCaseInsensitiveASCII(std::u16string_view a, std::u16string_view b) {
161   return CompareCaseInsensitiveASCIIT<std::u16string>(a, b);
162 }
163 
EqualsCaseInsensitiveASCII(std::string_view a,std::string_view b)164 bool EqualsCaseInsensitiveASCII(std::string_view a, std::string_view b) {
165   if (a.length() != b.length())
166     return false;
167   return CompareCaseInsensitiveASCIIT<std::string>(a, b) == 0;
168 }
169 
EqualsCaseInsensitiveASCII(std::u16string_view a,std::u16string_view b)170 bool EqualsCaseInsensitiveASCII(std::u16string_view a, std::u16string_view b) {
171   if (a.length() != b.length())
172     return false;
173   return CompareCaseInsensitiveASCIIT<std::u16string>(a, b) == 0;
174 }
175 
176 template <class StringType>
177 bool ReplaceCharsT(
178     const StringType& input,
179     std::basic_string_view<typename StringType::value_type> find_any_of_these,
180     std::basic_string_view<typename StringType::value_type> replace_with,
181     StringType* output);
182 
ReplaceChars(const std::u16string & input,std::u16string_view replace_chars,const std::u16string & replace_with,std::u16string * output)183 bool ReplaceChars(const std::u16string& input,
184                   std::u16string_view replace_chars,
185                   const std::u16string& replace_with,
186                   std::u16string* output) {
187   return ReplaceCharsT(input, replace_chars, std::u16string_view(replace_with),
188                        output);
189 }
190 
ReplaceChars(const std::string & input,std::string_view replace_chars,const std::string & replace_with,std::string * output)191 bool ReplaceChars(const std::string& input,
192                   std::string_view replace_chars,
193                   const std::string& replace_with,
194                   std::string* output) {
195   return ReplaceCharsT(input, replace_chars, std::string_view(replace_with),
196                        output);
197 }
198 
RemoveChars(const std::u16string & input,std::u16string_view remove_chars,std::u16string * output)199 bool RemoveChars(const std::u16string& input,
200                  std::u16string_view remove_chars,
201                  std::u16string* output) {
202   return ReplaceCharsT(input, remove_chars, std::u16string_view(), output);
203 }
204 
RemoveChars(const std::string & input,std::string_view remove_chars,std::string * output)205 bool RemoveChars(const std::string& input,
206                  std::string_view remove_chars,
207                  std::string* output) {
208   return ReplaceCharsT(input, remove_chars, std::string_view(), output);
209 }
210 
211 template <typename Str>
TrimStringT(const Str & input,std::basic_string_view<typename Str::value_type> trim_chars,TrimPositions positions,Str * output)212 TrimPositions TrimStringT(
213     const Str& input,
214     std::basic_string_view<typename Str::value_type> trim_chars,
215     TrimPositions positions,
216     Str* output) {
217   // Find the edges of leading/trailing whitespace as desired. Need to use
218   // a std::string_view version of input to be able to call find* on it with the
219   // std::string_view version of trim_chars (normally the trim_chars will be a
220   // constant so avoid making a copy).
221   std::basic_string_view<typename Str::value_type> input_piece(input);
222   const size_t last_char = input.length() - 1;
223   const size_t first_good_char = (positions & TRIM_LEADING)
224                                      ? input_piece.find_first_not_of(trim_chars)
225                                      : 0;
226   const size_t last_good_char = (positions & TRIM_TRAILING)
227                                     ? input_piece.find_last_not_of(trim_chars)
228                                     : last_char;
229 
230   // When the string was all trimmed, report that we stripped off characters
231   // from whichever position the caller was interested in. For empty input, we
232   // stripped no characters, but we still need to clear |output|.
233   if (input.empty() || (first_good_char == Str::npos) ||
234       (last_good_char == Str::npos)) {
235     bool input_was_empty = input.empty();  // in case output == &input
236     output->clear();
237     return input_was_empty ? TRIM_NONE : positions;
238   }
239 
240   // Trim.
241   *output = input.substr(first_good_char, last_good_char - first_good_char + 1);
242 
243   // Return where we trimmed from.
244   return static_cast<TrimPositions>(
245       ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) |
246       ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING));
247 }
248 
TrimString(const std::u16string & input,std::u16string_view trim_chars,std::u16string * output)249 bool TrimString(const std::u16string& input,
250                 std::u16string_view trim_chars,
251                 std::u16string* output) {
252   return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
253 }
254 
TrimString(const std::string & input,std::string_view trim_chars,std::string * output)255 bool TrimString(const std::string& input,
256                 std::string_view trim_chars,
257                 std::string* output) {
258   return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE;
259 }
260 
261 template <typename char_type>
TrimStringPieceT(std::basic_string_view<char_type> input,std::basic_string_view<char_type> trim_chars,TrimPositions positions)262 std::basic_string_view<char_type> TrimStringPieceT(
263     std::basic_string_view<char_type> input,
264     std::basic_string_view<char_type> trim_chars,
265     TrimPositions positions) {
266   size_t begin =
267       (positions & TRIM_LEADING) ? input.find_first_not_of(trim_chars) : 0;
268   if (begin == std::basic_string_view<char_type>::npos)
269     return std::basic_string_view<char_type>();  // All trimmed.
270 
271   size_t end = (positions & TRIM_TRAILING)
272                    ? input.find_last_not_of(trim_chars) + 1
273                    : input.size();
274   return input.substr(begin, end - begin);
275 }
276 
TrimString(std::u16string_view input,std::u16string_view trim_chars,TrimPositions positions)277 std::u16string_view TrimString(std::u16string_view input,
278                                std::u16string_view trim_chars,
279                                TrimPositions positions) {
280   return TrimStringPieceT(input, trim_chars, positions);
281 }
282 
TrimString(std::string_view input,std::string_view trim_chars,TrimPositions positions)283 std::string_view TrimString(std::string_view input,
284                             std::string_view trim_chars,
285                             TrimPositions positions) {
286   return TrimStringPieceT(input, trim_chars, positions);
287 }
288 
TruncateUTF8ToByteSize(const std::string & input,const size_t byte_size,std::string * output)289 void TruncateUTF8ToByteSize(const std::string& input,
290                             const size_t byte_size,
291                             std::string* output) {
292   DCHECK(output);
293   if (byte_size > input.length()) {
294     *output = input;
295     return;
296   }
297   DCHECK_LE(byte_size,
298             static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
299   // Note: This cast is necessary because CBU8_NEXT uses int32_ts.
300   int32_t truncation_length = static_cast<int32_t>(byte_size);
301   int32_t char_index = truncation_length - 1;
302   const char* data = input.data();
303 
304   // Using CBU8, we will move backwards from the truncation point
305   // to the beginning of the string looking for a valid UTF8
306   // character.  Once a full UTF8 character is found, we will
307   // truncate the string to the end of that character.
308   while (char_index >= 0) {
309     int32_t prev = char_index;
310     base_icu::UChar32 code_point = 0;
311     CBU8_NEXT(data, char_index, truncation_length, code_point);
312     if (!IsValidCharacter(code_point) || !IsValidCodepoint(code_point)) {
313       char_index = prev - 1;
314     } else {
315       break;
316     }
317   }
318 
319   if (char_index >= 0)
320     *output = input.substr(0, char_index);
321   else
322     output->clear();
323 }
324 
TrimWhitespace(const std::u16string & input,TrimPositions positions,std::u16string * output)325 TrimPositions TrimWhitespace(const std::u16string& input,
326                              TrimPositions positions,
327                              std::u16string* output) {
328   return TrimStringT(input, std::u16string_view(kWhitespaceUTF16), positions,
329                      output);
330 }
331 
TrimWhitespace(std::u16string_view input,TrimPositions positions)332 std::u16string_view TrimWhitespace(std::u16string_view input,
333                                    TrimPositions positions) {
334   return TrimStringPieceT(input, std::u16string_view(kWhitespaceUTF16),
335                           positions);
336 }
337 
TrimWhitespaceASCII(const std::string & input,TrimPositions positions,std::string * output)338 TrimPositions TrimWhitespaceASCII(const std::string& input,
339                                   TrimPositions positions,
340                                   std::string* output) {
341   return TrimStringT(input, std::string_view(kWhitespaceASCII), positions,
342                      output);
343 }
344 
TrimWhitespaceASCII(std::string_view input,TrimPositions positions)345 std::string_view TrimWhitespaceASCII(std::string_view input,
346                                      TrimPositions positions) {
347   return TrimStringPieceT(input, std::string_view(kWhitespaceASCII), positions);
348 }
349 
350 template <typename STR>
CollapseWhitespaceT(const STR & text,bool trim_sequences_with_line_breaks)351 STR CollapseWhitespaceT(const STR& text, bool trim_sequences_with_line_breaks) {
352   STR result;
353   result.resize(text.size());
354 
355   // Set flags to pretend we're already in a trimmed whitespace sequence, so we
356   // will trim any leading whitespace.
357   bool in_whitespace = true;
358   bool already_trimmed = true;
359 
360   int chars_written = 0;
361   for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) {
362     if (IsUnicodeWhitespace(*i)) {
363       if (!in_whitespace) {
364         // Reduce all whitespace sequences to a single space.
365         in_whitespace = true;
366         result[chars_written++] = L' ';
367       }
368       if (trim_sequences_with_line_breaks && !already_trimmed &&
369           ((*i == '\n') || (*i == '\r'))) {
370         // Whitespace sequences containing CR or LF are eliminated entirely.
371         already_trimmed = true;
372         --chars_written;
373       }
374     } else {
375       // Non-whitespace characters are copied straight across.
376       in_whitespace = false;
377       already_trimmed = false;
378       result[chars_written++] = *i;
379     }
380   }
381 
382   if (in_whitespace && !already_trimmed) {
383     // Any trailing whitespace is eliminated.
384     --chars_written;
385   }
386 
387   result.resize(chars_written);
388   return result;
389 }
390 
CollapseWhitespace(const std::u16string & text,bool trim_sequences_with_line_breaks)391 std::u16string CollapseWhitespace(const std::u16string& text,
392                                   bool trim_sequences_with_line_breaks) {
393   return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
394 }
395 
CollapseWhitespaceASCII(const std::string & text,bool trim_sequences_with_line_breaks)396 std::string CollapseWhitespaceASCII(const std::string& text,
397                                     bool trim_sequences_with_line_breaks) {
398   return CollapseWhitespaceT(text, trim_sequences_with_line_breaks);
399 }
400 
ContainsOnlyChars(std::string_view input,std::string_view characters)401 bool ContainsOnlyChars(std::string_view input, std::string_view characters) {
402   return input.find_first_not_of(characters) == std::string_view::npos;
403 }
404 
ContainsOnlyChars(std::u16string_view input,std::u16string_view characters)405 bool ContainsOnlyChars(std::u16string_view input,
406                        std::u16string_view characters) {
407   return input.find_first_not_of(characters) == std::u16string_view::npos;
408 }
409 
410 template <class Char>
DoIsStringASCII(const Char * characters,size_t length)411 inline bool DoIsStringASCII(const Char* characters, size_t length) {
412   MachineWord all_char_bits = 0;
413   const Char* end = characters + length;
414 
415   // Prologue: align the input.
416   while (!IsAlignedToMachineWord(characters) && characters != end) {
417     all_char_bits |= *characters;
418     ++characters;
419   }
420 
421   // Compare the values of CPU word size.
422   const Char* word_end = AlignToMachineWord(end);
423   const size_t loop_increment = sizeof(MachineWord) / sizeof(Char);
424   while (characters < word_end) {
425     all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters));
426     characters += loop_increment;
427   }
428 
429   // Process the remaining bytes.
430   while (characters != end) {
431     all_char_bits |= *characters;
432     ++characters;
433   }
434 
435   MachineWord non_ascii_bit_mask =
436       NonASCIIMask<sizeof(MachineWord), Char>::value();
437   return !(all_char_bits & non_ascii_bit_mask);
438 }
439 
IsStringASCII(std::string_view str)440 bool IsStringASCII(std::string_view str) {
441   return DoIsStringASCII(str.data(), str.length());
442 }
443 
IsStringASCII(std::u16string_view str)444 bool IsStringASCII(std::u16string_view str) {
445   return DoIsStringASCII(str.data(), str.length());
446 }
447 
IsStringUTF8(std::string_view str)448 bool IsStringUTF8(std::string_view str) {
449   const char* src = str.data();
450   int32_t src_len = static_cast<int32_t>(str.length());
451   int32_t char_index = 0;
452 
453   while (char_index < src_len) {
454     int32_t code_point;
455     CBU8_NEXT(src, char_index, src_len, code_point);
456     if (!IsValidCharacter(code_point))
457       return false;
458   }
459   return true;
460 }
461 
462 // Implementation note: Normally this function will be called with a hardcoded
463 // constant for the lowercase_ascii parameter. Constructing a std::string_view
464 // from a C constant requires running strlen, so the result will be two passes
465 // through the buffers, one to file the length of lowercase_ascii, and one to
466 // compare each letter.
467 //
468 // This function could have taken a const char* to avoid this and only do one
469 // pass through the string. But the strlen is faster than the case-insensitive
470 // compares and lets us early-exit in the case that the strings are different
471 // lengths (will often be the case for non-matches). So whether one approach or
472 // the other will be faster depends on the case.
473 //
474 // The hardcoded strings are typically very short so it doesn't matter, and the
475 // string piece gives additional flexibility for the caller (doesn't have to be
476 // null terminated) so we choose the std::string_view route.
477 template <typename Str>
DoLowerCaseEqualsASCII(std::basic_string_view<typename Str::value_type> str,std::string_view lowercase_ascii)478 static inline bool DoLowerCaseEqualsASCII(
479     std::basic_string_view<typename Str::value_type> str,
480     std::string_view lowercase_ascii) {
481   if (str.size() != lowercase_ascii.size())
482     return false;
483   for (size_t i = 0; i < str.size(); i++) {
484     if (ToLowerASCII(str[i]) != lowercase_ascii[i])
485       return false;
486   }
487   return true;
488 }
489 
LowerCaseEqualsASCII(std::string_view str,std::string_view lowercase_ascii)490 bool LowerCaseEqualsASCII(std::string_view str,
491                           std::string_view lowercase_ascii) {
492   return DoLowerCaseEqualsASCII<std::string>(str, lowercase_ascii);
493 }
494 
LowerCaseEqualsASCII(std::u16string_view str,std::string_view lowercase_ascii)495 bool LowerCaseEqualsASCII(std::u16string_view str,
496                           std::string_view lowercase_ascii) {
497   return DoLowerCaseEqualsASCII<std::u16string>(str, lowercase_ascii);
498 }
499 
EqualsASCII(std::u16string_view str,std::string_view ascii)500 bool EqualsASCII(std::u16string_view str, std::string_view ascii) {
501   if (str.length() != ascii.length())
502     return false;
503   return std::equal(ascii.begin(), ascii.end(), str.begin());
504 }
505 
506 template <typename char_type>
StartsWithT(std::basic_string_view<char_type> str,std::basic_string_view<char_type> search_for,CompareCase case_sensitivity)507 bool StartsWithT(std::basic_string_view<char_type> str,
508                  std::basic_string_view<char_type> search_for,
509                  CompareCase case_sensitivity) {
510   if (search_for.size() > str.size())
511     return false;
512 
513   std::basic_string_view<char_type> source = str.substr(0, search_for.size());
514 
515   switch (case_sensitivity) {
516     case CompareCase::SENSITIVE:
517       return source == search_for;
518 
519     case CompareCase::INSENSITIVE_ASCII:
520       return std::equal(search_for.begin(), search_for.end(), source.begin(),
521                         CaseInsensitiveCompareASCII<char_type>());
522 
523     default:
524       NOTREACHED();
525       return false;
526   }
527 }
528 
StartsWith(std::string_view str,std::string_view search_for,CompareCase case_sensitivity)529 bool StartsWith(std::string_view str,
530                 std::string_view search_for,
531                 CompareCase case_sensitivity) {
532   return StartsWithT(str, search_for, case_sensitivity);
533 }
534 
StartsWith(std::u16string_view str,std::u16string_view search_for,CompareCase case_sensitivity)535 bool StartsWith(std::u16string_view str,
536                 std::u16string_view search_for,
537                 CompareCase case_sensitivity) {
538   return StartsWithT(str, search_for, case_sensitivity);
539 }
540 
541 template <typename Str>
EndsWithT(std::basic_string_view<typename Str::value_type> str,std::basic_string_view<typename Str::value_type> search_for,CompareCase case_sensitivity)542 bool EndsWithT(std::basic_string_view<typename Str::value_type> str,
543                std::basic_string_view<typename Str::value_type> search_for,
544                CompareCase case_sensitivity) {
545   if (search_for.size() > str.size())
546     return false;
547 
548   std::basic_string_view<typename Str::value_type> source =
549       str.substr(str.size() - search_for.size(), search_for.size());
550 
551   switch (case_sensitivity) {
552     case CompareCase::SENSITIVE:
553       return source == search_for;
554 
555     case CompareCase::INSENSITIVE_ASCII:
556       return std::equal(
557           source.begin(), source.end(), search_for.begin(),
558           CaseInsensitiveCompareASCII<typename Str::value_type>());
559 
560     default:
561       NOTREACHED();
562       return false;
563   }
564 }
565 
EndsWith(std::string_view str,std::string_view search_for,CompareCase case_sensitivity)566 bool EndsWith(std::string_view str,
567               std::string_view search_for,
568               CompareCase case_sensitivity) {
569   return EndsWithT<std::string>(str, search_for, case_sensitivity);
570 }
571 
EndsWith(std::u16string_view str,std::u16string_view search_for,CompareCase case_sensitivity)572 bool EndsWith(std::u16string_view str,
573               std::u16string_view search_for,
574               CompareCase case_sensitivity) {
575   return EndsWithT<std::u16string>(str, search_for, case_sensitivity);
576 }
577 
HexDigitToInt(char16_t c)578 char HexDigitToInt(char16_t c) {
579   DCHECK(IsHexDigit(c));
580   if (c >= '0' && c <= '9')
581     return static_cast<char>(c - '0');
582   if (c >= 'A' && c <= 'F')
583     return static_cast<char>(c - 'A' + 10);
584   if (c >= 'a' && c <= 'f')
585     return static_cast<char>(c - 'a' + 10);
586   return 0;
587 }
588 
IsUnicodeWhitespace(char16_t c)589 bool IsUnicodeWhitespace(char16_t c) {
590   // kWhitespaceWide is a NULL-terminated string
591   for (const char16_t* cur = kWhitespaceUTF16; *cur; ++cur) {
592     if (*cur == c)
593       return true;
594   }
595   return false;
596 }
597 
598 static const char* const kByteStringsUnlocalized[] = {" B",  " kB", " MB",
599                                                       " GB", " TB", " PB"};
600 
FormatBytesUnlocalized(int64_t bytes)601 std::u16string FormatBytesUnlocalized(int64_t bytes) {
602   double unit_amount = static_cast<double>(bytes);
603   size_t dimension = 0;
604   const int kKilo = 1024;
605   while (unit_amount >= kKilo &&
606          dimension < std::size(kByteStringsUnlocalized) - 1) {
607     unit_amount /= kKilo;
608     dimension++;
609   }
610 
611   char buf[64];
612   if (bytes != 0 && dimension > 0 && unit_amount < 100) {
613     base::snprintf(buf, std::size(buf), "%.1lf%s", unit_amount,
614                    kByteStringsUnlocalized[dimension]);
615   } else {
616     base::snprintf(buf, std::size(buf), "%.0lf%s", unit_amount,
617                    kByteStringsUnlocalized[dimension]);
618   }
619 
620   return ASCIIToUTF16(buf);
621 }
622 
623 // A Matcher for DoReplaceMatchesAfterOffset() that matches substrings.
624 template <class StringType>
625 struct SubstringMatcher {
626   std::basic_string_view<typename StringType::value_type> find_this;
627 
Findbase::SubstringMatcher628   size_t Find(const StringType& input, size_t pos) {
629     return input.find(find_this.data(), pos, find_this.length());
630   }
MatchSizebase::SubstringMatcher631   size_t MatchSize() { return find_this.length(); }
632 };
633 
634 // A Matcher for DoReplaceMatchesAfterOffset() that matches single characters.
635 template <class StringType>
636 struct CharacterMatcher {
637   std::basic_string_view<typename StringType::value_type> find_any_of_these;
638 
Findbase::CharacterMatcher639   size_t Find(const StringType& input, size_t pos) {
640     return input.find_first_of(find_any_of_these.data(), pos,
641                                find_any_of_these.length());
642   }
MatchSizebase::CharacterMatcher643   constexpr size_t MatchSize() { return 1; }
644 };
645 
646 enum class ReplaceType { REPLACE_ALL, REPLACE_FIRST };
647 
648 // Runs in O(n) time in the length of |str|, and transforms the string without
649 // reallocating when possible. Returns |true| if any matches were found.
650 //
651 // This is parameterized on a |Matcher| traits type, so that it can be the
652 // implementation for both ReplaceChars() and ReplaceSubstringsAfterOffset().
653 template <class StringType, class Matcher>
DoReplaceMatchesAfterOffset(StringType * str,size_t initial_offset,Matcher matcher,std::basic_string_view<typename StringType::value_type> replace_with,ReplaceType replace_type)654 bool DoReplaceMatchesAfterOffset(
655     StringType* str,
656     size_t initial_offset,
657     Matcher matcher,
658     std::basic_string_view<typename StringType::value_type> replace_with,
659     ReplaceType replace_type) {
660   using CharTraits = typename StringType::traits_type;
661 
662   const size_t find_length = matcher.MatchSize();
663   if (!find_length)
664     return false;
665 
666   // If the find string doesn't appear, there's nothing to do.
667   size_t first_match = matcher.Find(*str, initial_offset);
668   if (first_match == StringType::npos)
669     return false;
670 
671   // If we're only replacing one instance, there's no need to do anything
672   // complicated.
673   const size_t replace_length = replace_with.length();
674   if (replace_type == ReplaceType::REPLACE_FIRST) {
675     str->replace(first_match, find_length, replace_with.data(), replace_length);
676     return true;
677   }
678 
679   // If the find and replace strings are the same length, we can simply use
680   // replace() on each instance, and finish the entire operation in O(n) time.
681   if (find_length == replace_length) {
682     auto* buffer = &((*str)[0]);
683     for (size_t offset = first_match; offset != StringType::npos;
684          offset = matcher.Find(*str, offset + replace_length)) {
685       CharTraits::copy(buffer + offset, replace_with.data(), replace_length);
686     }
687     return true;
688   }
689 
690   // Since the find and replace strings aren't the same length, a loop like the
691   // one above would be O(n^2) in the worst case, as replace() will shift the
692   // entire remaining string each time. We need to be more clever to keep things
693   // O(n).
694   //
695   // When the string is being shortened, it's possible to just shift the matches
696   // down in one pass while finding, and truncate the length at the end of the
697   // search.
698   //
699   // If the string is being lengthened, more work is required. The strategy used
700   // here is to make two find() passes through the string. The first pass counts
701   // the number of matches to determine the new size. The second pass will
702   // either construct the new string into a new buffer (if the existing buffer
703   // lacked capacity), or else -- if there is room -- create a region of scratch
704   // space after |first_match| by shifting the tail of the string to a higher
705   // index, and doing in-place moves from the tail to lower indices thereafter.
706   size_t str_length = str->length();
707   size_t expansion = 0;
708   if (replace_length > find_length) {
709     // This operation lengthens the string; determine the new length by counting
710     // matches.
711     const size_t expansion_per_match = (replace_length - find_length);
712     size_t num_matches = 0;
713     for (size_t match = first_match; match != StringType::npos;
714          match = matcher.Find(*str, match + find_length)) {
715       expansion += expansion_per_match;
716       ++num_matches;
717     }
718     const size_t final_length = str_length + expansion;
719 
720     if (str->capacity() < final_length) {
721       // If we'd have to allocate a new buffer to grow the string, build the
722       // result directly into the new allocation via append().
723       StringType src(str->get_allocator());
724       str->swap(src);
725       str->reserve(final_length);
726 
727       size_t pos = 0;
728       for (size_t match = first_match;; match = matcher.Find(src, pos)) {
729         str->append(src, pos, match - pos);
730         str->append(replace_with.data(), replace_length);
731         pos = match + find_length;
732 
733         // A mid-loop test/break enables skipping the final Find() call; the
734         // number of matches is known, so don't search past the last one.
735         if (!--num_matches)
736           break;
737       }
738 
739       // Handle substring after the final match.
740       str->append(src, pos, str_length - pos);
741       return true;
742     }
743 
744     // Prepare for the copy/move loop below -- expand the string to its final
745     // size by shifting the data after the first match to the end of the resized
746     // string.
747     size_t shift_src = first_match + find_length;
748     size_t shift_dst = shift_src + expansion;
749 
750     // Big |expansion| factors (relative to |str_length|) require padding up to
751     // |shift_dst|.
752     if (shift_dst > str_length)
753       str->resize(shift_dst);
754 
755     str->replace(shift_dst, str_length - shift_src, *str, shift_src,
756                  str_length - shift_src);
757     str_length = final_length;
758   }
759 
760   // We can alternate replacement and move operations. This won't overwrite the
761   // unsearched region of the string so long as |write_offset| <= |read_offset|;
762   // that condition is always satisfied because:
763   //
764   //   (a) If the string is being shortened, |expansion| is zero and
765   //       |write_offset| grows slower than |read_offset|.
766   //
767   //   (b) If the string is being lengthened, |write_offset| grows faster than
768   //       |read_offset|, but |expansion| is big enough so that |write_offset|
769   //       will only catch up to |read_offset| at the point of the last match.
770   auto* buffer = &((*str)[0]);
771   size_t write_offset = first_match;
772   size_t read_offset = first_match + expansion;
773   do {
774     if (replace_length) {
775       CharTraits::copy(buffer + write_offset, replace_with.data(),
776                        replace_length);
777       write_offset += replace_length;
778     }
779     read_offset += find_length;
780 
781     // min() clamps StringType::npos (the largest unsigned value) to str_length.
782     size_t match = std::min(matcher.Find(*str, read_offset), str_length);
783 
784     size_t length = match - read_offset;
785     if (length) {
786       CharTraits::move(buffer + write_offset, buffer + read_offset, length);
787       write_offset += length;
788       read_offset += length;
789     }
790   } while (read_offset < str_length);
791 
792   // If we're shortening the string, truncate it now.
793   str->resize(write_offset);
794   return true;
795 }
796 
797 template <class StringType>
ReplaceCharsT(const StringType & input,std::basic_string_view<typename StringType::value_type> find_any_of_these,std::basic_string_view<typename StringType::value_type> replace_with,StringType * output)798 bool ReplaceCharsT(
799     const StringType& input,
800     std::basic_string_view<typename StringType::value_type> find_any_of_these,
801     std::basic_string_view<typename StringType::value_type> replace_with,
802     StringType* output) {
803   // Commonly, this is called with output and input being the same string; in
804   // that case, this assignment is inexpensive.
805   *output = input;
806 
807   return DoReplaceMatchesAfterOffset(
808       output, 0, CharacterMatcher<StringType>{find_any_of_these}, replace_with,
809       ReplaceType::REPLACE_ALL);
810 }
811 
ReplaceFirstSubstringAfterOffset(std::u16string * str,size_t start_offset,std::u16string_view find_this,std::u16string_view replace_with)812 void ReplaceFirstSubstringAfterOffset(std::u16string* str,
813                                       size_t start_offset,
814                                       std::u16string_view find_this,
815                                       std::u16string_view replace_with) {
816   DoReplaceMatchesAfterOffset(str, start_offset,
817                               SubstringMatcher<std::u16string>{find_this},
818                               replace_with, ReplaceType::REPLACE_FIRST);
819 }
820 
ReplaceFirstSubstringAfterOffset(std::string * str,size_t start_offset,std::string_view find_this,std::string_view replace_with)821 void ReplaceFirstSubstringAfterOffset(std::string* str,
822                                       size_t start_offset,
823                                       std::string_view find_this,
824                                       std::string_view replace_with) {
825   DoReplaceMatchesAfterOffset(str, start_offset,
826                               SubstringMatcher<std::string>{find_this},
827                               replace_with, ReplaceType::REPLACE_FIRST);
828 }
829 
ReplaceSubstringsAfterOffset(std::u16string * str,size_t start_offset,std::u16string_view find_this,std::u16string_view replace_with)830 void ReplaceSubstringsAfterOffset(std::u16string* str,
831                                   size_t start_offset,
832                                   std::u16string_view find_this,
833                                   std::u16string_view replace_with) {
834   DoReplaceMatchesAfterOffset(str, start_offset,
835                               SubstringMatcher<std::u16string>{find_this},
836                               replace_with, ReplaceType::REPLACE_ALL);
837 }
838 
ReplaceSubstringsAfterOffset(std::string * str,size_t start_offset,std::string_view find_this,std::string_view replace_with)839 void ReplaceSubstringsAfterOffset(std::string* str,
840                                   size_t start_offset,
841                                   std::string_view find_this,
842                                   std::string_view replace_with) {
843   DoReplaceMatchesAfterOffset(str, start_offset,
844                               SubstringMatcher<std::string>{find_this},
845                               replace_with, ReplaceType::REPLACE_ALL);
846 }
847 
848 template <class string_type>
WriteIntoT(string_type * str,size_t length_with_null)849 inline typename string_type::value_type* WriteIntoT(string_type* str,
850                                                     size_t length_with_null) {
851   DCHECK_GT(length_with_null, 1u);
852   str->reserve(length_with_null);
853   str->resize(length_with_null - 1);
854   return &((*str)[0]);
855 }
856 
WriteInto(std::string * str,size_t length_with_null)857 char* WriteInto(std::string* str, size_t length_with_null) {
858   return WriteIntoT(str, length_with_null);
859 }
860 
WriteInto(std::u16string * str,size_t length_with_null)861 char16_t* WriteInto(std::u16string* str, size_t length_with_null) {
862   return WriteIntoT(str, length_with_null);
863 }
864 
865 #if defined(_MSC_VER) && !defined(__clang__)
866 // Work around VC++ code-gen bug. https://crbug.com/804884
867 #pragma optimize("", off)
868 #endif
869 
870 // Generic version for all JoinString overloads. |list_type| must be a sequence
871 // (std::vector or std::initializer_list) of strings/string_views of any type.
872 template <typename char_type, typename list_type>
JoinStringT(const list_type & parts,std::basic_string_view<char_type> sep)873 static std::basic_string<char_type> JoinStringT(
874     const list_type& parts,
875     std::basic_string_view<char_type> sep) {
876   if (parts.size() == 0)
877     return std::basic_string<char_type>();
878 
879   // Pre-allocate the eventual size of the string. Start with the size of all of
880   // the separators (note that this *assumes* parts.size() > 0).
881   size_t total_size = (parts.size() - 1) * sep.size();
882   for (const auto& part : parts)
883     total_size += part.size();
884   std::basic_string<char_type> result;
885   result.reserve(total_size);
886 
887   auto iter = parts.begin();
888   DCHECK(iter != parts.end());
889   result.append(*iter);
890   ++iter;
891 
892   for (; iter != parts.end(); ++iter) {
893     result.append(sep);
894     result.append(*iter);
895   }
896 
897   // Sanity-check that we pre-allocated correctly.
898   DCHECK_EQ(total_size, result.size());
899 
900   return result;
901 }
902 
JoinString(const std::vector<std::string> & parts,std::string_view separator)903 std::string JoinString(const std::vector<std::string>& parts,
904                        std::string_view separator) {
905   return JoinStringT(parts, separator);
906 }
907 
JoinString(const std::vector<std::u16string> & parts,std::u16string_view separator)908 std::u16string JoinString(const std::vector<std::u16string>& parts,
909                           std::u16string_view separator) {
910   return JoinStringT(parts, separator);
911 }
912 
913 #if defined(_MSC_VER) && !defined(__clang__)
914 // Work around VC++ code-gen bug. https://crbug.com/804884
915 #pragma optimize("", on)
916 #endif
917 
JoinString(const std::vector<std::string_view> & parts,std::string_view separator)918 std::string JoinString(const std::vector<std::string_view>& parts,
919                        std::string_view separator) {
920   return JoinStringT(parts, separator);
921 }
922 
JoinString(const std::vector<std::u16string_view> & parts,std::u16string_view separator)923 std::u16string JoinString(const std::vector<std::u16string_view>& parts,
924                           std::u16string_view separator) {
925   return JoinStringT(parts, separator);
926 }
927 
JoinString(std::initializer_list<std::string_view> parts,std::string_view separator)928 std::string JoinString(std::initializer_list<std::string_view> parts,
929                        std::string_view separator) {
930   return JoinStringT(parts, separator);
931 }
932 
JoinString(std::initializer_list<std::u16string_view> parts,std::u16string_view separator)933 std::u16string JoinString(std::initializer_list<std::u16string_view> parts,
934                           std::u16string_view separator) {
935   return JoinStringT(parts, separator);
936 }
937 
938 template <class FormatStringType, class OutStringType>
DoReplaceStringPlaceholders(const FormatStringType & format_string,const std::vector<OutStringType> & subst,std::vector<size_t> * offsets)939 OutStringType DoReplaceStringPlaceholders(
940     const FormatStringType& format_string,
941     const std::vector<OutStringType>& subst,
942     std::vector<size_t>* offsets) {
943   size_t substitutions = subst.size();
944   DCHECK_LT(substitutions, 10U);
945 
946   size_t sub_length = 0;
947   for (const auto& cur : subst)
948     sub_length += cur.length();
949 
950   OutStringType formatted;
951   formatted.reserve(format_string.length() + sub_length);
952 
953   std::vector<ReplacementOffset> r_offsets;
954   for (auto i = format_string.begin(); i != format_string.end(); ++i) {
955     if ('$' == *i) {
956       if (i + 1 != format_string.end()) {
957         ++i;
958         if ('$' == *i) {
959           while (i != format_string.end() && '$' == *i) {
960             formatted.push_back('$');
961             ++i;
962           }
963           --i;
964         } else {
965           if (*i < '1' || *i > '9') {
966             DLOG(ERROR) << "Invalid placeholder: $" << *i;
967             continue;
968           }
969           uintptr_t index = *i - '1';
970           if (offsets) {
971             ReplacementOffset r_offset(index,
972                                        static_cast<int>(formatted.size()));
973             r_offsets.insert(
974                 std::upper_bound(r_offsets.begin(), r_offsets.end(), r_offset,
975                                  &CompareParameter),
976                 r_offset);
977           }
978           if (index < substitutions)
979             formatted.append(subst.at(index));
980         }
981       }
982     } else {
983       formatted.push_back(*i);
984     }
985   }
986   if (offsets) {
987     for (const auto& cur : r_offsets)
988       offsets->push_back(cur.offset);
989   }
990   return formatted;
991 }
992 
ReplaceStringPlaceholders(const std::u16string & format_string,const std::vector<std::u16string> & subst,std::vector<size_t> * offsets)993 std::u16string ReplaceStringPlaceholders(
994     const std::u16string& format_string,
995     const std::vector<std::u16string>& subst,
996     std::vector<size_t>* offsets) {
997   return DoReplaceStringPlaceholders(format_string, subst, offsets);
998 }
999 
ReplaceStringPlaceholders(std::string_view format_string,const std::vector<std::string> & subst,std::vector<size_t> * offsets)1000 std::string ReplaceStringPlaceholders(std::string_view format_string,
1001                                       const std::vector<std::string>& subst,
1002                                       std::vector<size_t>* offsets) {
1003   return DoReplaceStringPlaceholders(format_string, subst, offsets);
1004 }
1005 
ReplaceStringPlaceholders(const std::u16string & format_string,const std::u16string & a,size_t * offset)1006 std::u16string ReplaceStringPlaceholders(const std::u16string& format_string,
1007                                          const std::u16string& a,
1008                                          size_t* offset) {
1009   std::vector<size_t> offsets;
1010   std::vector<std::u16string> subst;
1011   subst.push_back(a);
1012   std::u16string result =
1013       ReplaceStringPlaceholders(format_string, subst, &offsets);
1014 
1015   DCHECK_EQ(1U, offsets.size());
1016   if (offset)
1017     *offset = offsets[0];
1018   return result;
1019 }
1020 
1021 // The following code is compatible with the OpenBSD lcpy interface.  See:
1022 //   http://www.gratisoft.us/todd/papers/strlcpy.html
1023 //   ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c
1024 
1025 namespace {
1026 
1027 template <typename CHAR>
lcpyT(CHAR * dst,const CHAR * src,size_t dst_size)1028 size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) {
1029   for (size_t i = 0; i < dst_size; ++i) {
1030     if ((dst[i] = src[i]) == 0)  // We hit and copied the terminating NULL.
1031       return i;
1032   }
1033 
1034   // We were left off at dst_size.  We over copied 1 byte.  Null terminate.
1035   if (dst_size != 0)
1036     dst[dst_size - 1] = 0;
1037 
1038   // Count the rest of the |src|, and return it's length in characters.
1039   while (src[dst_size])
1040     ++dst_size;
1041   return dst_size;
1042 }
1043 
1044 }  // namespace
1045 
1046 }  // namespace base
1047