1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_UTIL_I18N_UTILS_H_ 16 #define ICING_UTIL_I18N_UTILS_H_ 17 18 #include <string> 19 #include <string_view> 20 21 #include "icing/text_classifier/lib3/utils/base/statusor.h" 22 #include "unicode/umachine.h" 23 24 namespace icing { 25 namespace lib { 26 27 // Internationalization utils that use standard utilities or custom code. Does 28 // not require any special dependencies, such as data files for ICU. 29 namespace i18n_utils { 30 31 // An invalid value defined by Unicode. 32 static constexpr UChar32 kInvalidUChar32 = 0xFFFD; 33 34 // Converts a UTF16 string to a UTF8 string. 35 // 36 // Returns: 37 // A UTF8 string on success 38 // INTERNAL_ERROR on any failures 39 libtextclassifier3::StatusOr<std::string> Utf16ToUtf8( 40 const std::u16string& utf16_string); 41 42 // Converts a UTF8 string to a UTF16 string. 43 // 44 // Returns: 45 // A UTF16 string on success 46 // INTERNAL_ERROR on any failures 47 libtextclassifier3::StatusOr<std::u16string> Utf8ToUtf16( 48 std::string_view utf8_string); 49 50 // Returns the char at the given position. 51 UChar32 GetUChar32At(const char* data, int length, int position); 52 53 // Returns the safe position to truncate a UTF8 string at so that multi-byte 54 // UTF8 characters are not cut in the middle. The returned value will always be 55 // 0 <= val <= desired_length. 56 // 57 // REQUIRES: 0 <= desired_length < strlen(str) 58 int SafeTruncateUtf8Length(const char* str, int desired_length); 59 60 // Safely truncates a UTF8 string so that multi-byte UTF8 characters are not cut 61 // in the middle. The string will be truncated in place. 62 void SafeTruncateUtf8(std::string* str, int truncate_to_length); 63 64 // Checks if the single char is within ASCII range. 65 bool IsAscii(char c); 66 67 // Checks if the Unicode char is within ASCII range. 68 bool IsAscii(UChar32 c); 69 70 // Returns how many code units (char) are used for the UTF-8 encoding of this 71 // Unicode character. Returns 0 if not valid. 72 int GetUtf8Length(UChar32 c); 73 74 // Returns how many code units (char16_t) are used for the UTF-16 encoding of 75 // this Unicode character. Returns 0 if not valid. 76 int GetUtf16Length(UChar32 c); 77 78 // Checks if the single char is the first byte of a UTF8 character, note 79 // that a single ASCII char is also considered a lead byte. 80 bool IsLeadUtf8Byte(char c); 81 82 // Checks if the character at position is punctuation. Assigns the length of the 83 // character at position to *char_len_out if the character at position is valid 84 // punctuation and char_len_out is not null. 85 bool IsPunctuationAt(std::string_view input, int position, 86 int* char_len_out = nullptr); 87 88 // Checks if the character at position is a whitespace. 89 bool IsWhitespaceAt(std::string_view input, int position); 90 91 // Checks if the character at position is a whitespace. 92 bool IsAlphabeticAt(std::string_view input, int position); 93 94 void AppendUchar32ToUtf8(std::string* utf8_string, UChar32 uchar); 95 96 } // namespace i18n_utils 97 } // namespace lib 98 } // namespace icing 99 100 #endif // ICING_UTIL_I18N_UTILS_H_ 101