1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_ 18 #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_ 19 20 #include "utils/base/integral_types.h" 21 #include "utils/utf8/unicodetext.h" 22 #include "utils/utf8/unilib-common.h" 23 24 #if defined TC3_UNILIB_ICU 25 #include "utils/utf8/unilib-icu.h" 26 #define INIT_UNILIB_FOR_TESTING(VAR) VAR() 27 #elif defined TC3_UNILIB_JAVAICU 28 #include "utils/utf8/unilib-javaicu.h" 29 #define INIT_UNILIB_FOR_TESTING(VAR) VAR(nullptr) 30 #elif defined TC3_UNILIB_APPLE 31 #include "utils/utf8/unilib-apple.h" 32 #define INIT_UNILIB_FOR_TESTING(VAR) VAR() 33 #else 34 #error No TC3_UNILIB implementation specified. 35 #endif 36 37 namespace libtextclassifier3 { 38 39 class UniLib : public UniLibBase { 40 public: 41 using UniLibBase::UniLibBase; 42 43 // Lowercase a unicode string. ToLowerText(const UnicodeText & text)44 UnicodeText ToLowerText(const UnicodeText& text) const { 45 UnicodeText result; 46 for (const char32 codepoint : text) { 47 result.push_back(ToLower(codepoint)); 48 } 49 return result; 50 } 51 52 // Uppercase a unicode string. ToUpperText(const UnicodeText & text)53 UnicodeText ToUpperText(const UnicodeText& text) const { 54 UnicodeText result; 55 for (const char32 codepoint : text) { 56 result.push_back(UniLibBase::ToUpper(codepoint)); 57 } 58 return result; 59 } 60 IsLowerText(const UnicodeText & text)61 bool IsLowerText(const UnicodeText& text) const { 62 for (const char32 codepoint : text) { 63 if (!IsLower(codepoint)) { 64 return false; 65 } 66 } 67 return true; 68 } 69 IsUpperText(const UnicodeText & text)70 bool IsUpperText(const UnicodeText& text) const { 71 for (const char32 codepoint : text) { 72 if (!IsUpper(codepoint)) { 73 return false; 74 } 75 } 76 return true; 77 } 78 IsDigits(const UnicodeText & text)79 bool IsDigits(const UnicodeText& text) const { 80 for (const char32 codepoint : text) { 81 if (!IsDigit(codepoint)) { 82 return false; 83 } 84 } 85 return true; 86 } 87 IsPercentage(char32 codepoint)88 bool IsPercentage(char32 codepoint) const { 89 return libtextclassifier3::IsPercentage(codepoint); 90 } 91 IsSlash(char32 codepoint)92 bool IsSlash(char32 codepoint) const { 93 return libtextclassifier3::IsSlash(codepoint); 94 } 95 IsMinus(char32 codepoint)96 bool IsMinus(char32 codepoint) const { 97 return libtextclassifier3::IsMinus(codepoint); 98 } 99 IsNumberSign(char32 codepoint)100 bool IsNumberSign(char32 codepoint) const { 101 return libtextclassifier3::IsNumberSign(codepoint); 102 } 103 IsDot(char32 codepoint)104 bool IsDot(char32 codepoint) const { 105 return libtextclassifier3::IsDot(codepoint); 106 } 107 IsApostrophe(char32 codepoint)108 bool IsApostrophe(char32 codepoint) const { 109 return libtextclassifier3::IsApostrophe(codepoint); 110 } 111 IsQuotation(char32 codepoint)112 bool IsQuotation(char32 codepoint) const { 113 return libtextclassifier3::IsQuotation(codepoint); 114 } 115 IsAmpersand(char32 codepoint)116 bool IsAmpersand(char32 codepoint) const { 117 return libtextclassifier3::IsAmpersand(codepoint); 118 } 119 IsLatinLetter(char32 codepoint)120 bool IsLatinLetter(char32 codepoint) const { 121 return libtextclassifier3::IsLatinLetter(codepoint); 122 } 123 IsArabicLetter(char32 codepoint)124 bool IsArabicLetter(char32 codepoint) const { 125 return libtextclassifier3::IsArabicLetter(codepoint); 126 } 127 IsCyrillicLetter(char32 codepoint)128 bool IsCyrillicLetter(char32 codepoint) const { 129 return libtextclassifier3::IsCyrillicLetter(codepoint); 130 } 131 IsChineseLetter(char32 codepoint)132 bool IsChineseLetter(char32 codepoint) const { 133 return libtextclassifier3::IsChineseLetter(codepoint); 134 } 135 IsJapaneseLetter(char32 codepoint)136 bool IsJapaneseLetter(char32 codepoint) const { 137 return libtextclassifier3::IsJapaneseLetter(codepoint); 138 } 139 IsKoreanLetter(char32 codepoint)140 bool IsKoreanLetter(char32 codepoint) const { 141 return libtextclassifier3::IsKoreanLetter(codepoint); 142 } 143 IsThaiLetter(char32 codepoint)144 bool IsThaiLetter(char32 codepoint) const { 145 return libtextclassifier3::IsThaiLetter(codepoint); 146 } 147 IsCJTletter(char32 codepoint)148 bool IsCJTletter(char32 codepoint) const { 149 return libtextclassifier3::IsCJTletter(codepoint); 150 } 151 IsLetter(char32 codepoint)152 bool IsLetter(char32 codepoint) const { 153 return libtextclassifier3::IsLetter(codepoint); 154 } 155 IsValidUtf8(const UnicodeText & text)156 bool IsValidUtf8(const UnicodeText& text) const { 157 // Basic check of structural validity of UTF8. 158 if (!text.is_valid()) { 159 return false; 160 } 161 // In addition to that, we declare that a valid UTF8 is when the number of 162 // codepoints in the string as measured by ICU is the same as the number of 163 // codepoints as measured by UnicodeText. Because if we don't do this check, 164 // the indices might differ, and cause trouble, because the assumption 165 // throughout the code is that ICU indices and UnicodeText indices are the 166 // same. 167 // NOTE: This is not perfect, as this doesn't check the alignment of the 168 // codepoints, but for the practical purposes should be enough. 169 const StatusOr<int32> icu_length = Length(text); 170 if (!icu_length.ok()) { 171 return false; 172 } 173 174 if (icu_length.ValueOrDie() != text.size_codepoints()) { 175 return false; 176 } 177 178 return true; 179 } 180 }; 181 182 } // namespace libtextclassifier3 183 #endif // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_H_ 184