1 // Copyright 2023 The Pigweed Authors
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
4 // use this file except in compliance with the License. You may obtain a copy of
5 // the License at
6 //
7 // https://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 // License for the specific language governing permissions and limitations under
13 // the License.
14
15 #include <cpp-string/utf_codecs.h>
16 #include <unicode/utf8.h>
17
18 namespace bt_lib_cpp_string {
19
IsStringUTF8(std::string_view str)20 bool IsStringUTF8(std::string_view str) {
21 const char* src = str.data();
22 size_t src_len = str.size();
23 size_t char_index = 0;
24
25 while (char_index < src_len) {
26 int32_t code_point;
27 U8_NEXT(src, char_index, src_len, code_point);
28 if (!IsValidCharacter(code_point))
29 return false;
30 }
31 return true;
32 }
33
34 // ReadUnicodeCharacter --------------------------------------------------------
35
ReadUnicodeCharacter(const char * src,size_t src_len,size_t * char_index,uint32_t * code_point_out)36 bool ReadUnicodeCharacter(const char* src,
37 size_t src_len,
38 size_t* char_index,
39 uint32_t* code_point_out) {
40 // U8_NEXT expects to be able to use -1 to signal an error, so we must
41 // use a signed type for code_point. But this function returns false
42 // on error anyway, so code_point_out is unsigned.
43 int32_t code_point;
44 U8_NEXT(src, *char_index, src_len, code_point);
45 *code_point_out = static_cast<uint32_t>(code_point);
46
47 // The ICU macro above moves to the next char, we want to point to the last
48 // char consumed.
49 (*char_index)--;
50
51 // Validate the decoded value.
52 return IsValidCodepoint(code_point);
53 }
54
55 // WriteUnicodeCharacter -------------------------------------------------------
56
WriteUnicodeCharacter(uint32_t code_point,std::string * output)57 size_t WriteUnicodeCharacter(uint32_t code_point, std::string* output) {
58 if (code_point <= 0x7f) {
59 // Fast path the common case of one byte.
60 output->push_back(static_cast<char>(code_point));
61 return 1;
62 }
63
64 // BT_LIB_U8_APPEND_UNSAFE can append up to 4 bytes.
65 size_t char_offset = output->length();
66 size_t original_char_offset = char_offset;
67 output->resize(char_offset + U8_MAX_LENGTH);
68
69 U8_APPEND_UNSAFE(&(*output)[0], char_offset, code_point);
70
71 // BT_LIB_U8_APPEND_UNSAFE will advance our pointer past the inserted
72 // character, so it will represent the new length of the string.
73 output->resize(char_offset);
74 return char_offset - original_char_offset;
75 }
76
77 } // namespace bt_lib_cpp_string
78