• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2023 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 //     https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14 
15 #include <cpp-string/utf_codecs.h>
16 #include <unicode/utf8.h>
17 
18 namespace bt_lib_cpp_string {
19 
IsStringUTF8(std::string_view str)20 bool IsStringUTF8(std::string_view str) {
21   const char* src = str.data();
22   size_t src_len = str.size();
23   size_t char_index = 0;
24 
25   while (char_index < src_len) {
26     int32_t code_point;
27     U8_NEXT(src, char_index, src_len, code_point);
28     if (!IsValidCharacter(code_point))
29       return false;
30   }
31   return true;
32 }
33 
34 // ReadUnicodeCharacter --------------------------------------------------------
35 
ReadUnicodeCharacter(const char * src,size_t src_len,size_t * char_index,uint32_t * code_point_out)36 bool ReadUnicodeCharacter(const char* src,
37                           size_t src_len,
38                           size_t* char_index,
39                           uint32_t* code_point_out) {
40   // U8_NEXT expects to be able to use -1 to signal an error, so we must
41   // use a signed type for code_point.  But this function returns false
42   // on error anyway, so code_point_out is unsigned.
43   int32_t code_point;
44   U8_NEXT(src, *char_index, src_len, code_point);
45   *code_point_out = static_cast<uint32_t>(code_point);
46 
47   // The ICU macro above moves to the next char, we want to point to the last
48   // char consumed.
49   (*char_index)--;
50 
51   // Validate the decoded value.
52   return IsValidCodepoint(code_point);
53 }
54 
55 // WriteUnicodeCharacter -------------------------------------------------------
56 
WriteUnicodeCharacter(uint32_t code_point,std::string * output)57 size_t WriteUnicodeCharacter(uint32_t code_point, std::string* output) {
58   if (code_point <= 0x7f) {
59     // Fast path the common case of one byte.
60     output->push_back(static_cast<char>(code_point));
61     return 1;
62   }
63 
64   // BT_LIB_U8_APPEND_UNSAFE can append up to 4 bytes.
65   size_t char_offset = output->length();
66   size_t original_char_offset = char_offset;
67   output->resize(char_offset + U8_MAX_LENGTH);
68 
69   U8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
70 
71   // BT_LIB_U8_APPEND_UNSAFE will advance our pointer past the inserted
72   // character, so it will represent the new length of the string.
73   output->resize(char_offset);
74   return char_offset - original_char_offset;
75 }
76 
77 }  // namespace bt_lib_cpp_string
78