1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_ 16 #define ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_ 17 18 #include <memory> 19 #include <string> 20 #include <string_view> 21 22 #include "icing/text_classifier/lib3/utils/base/statusor.h" 23 #include "icing/transform/normalizer.h" 24 #include "icing/util/character-iterator.h" 25 #include "unicode/unorm2.h" 26 #include "unicode/utrans.h" 27 28 namespace icing { 29 namespace lib { 30 31 // Used to normalize UTF8 strings for text matching. It enforces a set of rules: 32 // 1. Transforms text to be lowercase UTF8. 33 // 2. Transforms full-width Latin characters to ASCII characters if possible. 34 // 3. Transforms hiragana to katakana. 35 // 4. Removes accent / diacritic marks on Latin characters 36 // 5. Removes accent / diacritic marks on Greek characters 37 // 6. Normalized text must be less than or equal to max_term_byte_size, 38 // otherwise it will be truncated. 39 // 40 // There're some other rules from ICU not listed here, please see .cc file for 41 // details. 42 class IcuNormalizer : public Normalizer { 43 public: 44 // Creates a normalizer with the subcomponents it needs. max_term_byte_size 45 // enforces the max size of text after normalization, text will be truncated 46 // if exceeds the max size. 47 // 48 // Returns: 49 // A normalizer on success 50 // INVALID_ARGUMENT if max_term_byte_size <= 0 51 // INTERNAL_ERROR if failed to create any subcomponent 52 static libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer>> Create( 53 int max_term_byte_size); 54 55 // Normalizes the input term based on rules. See .cc file for rule details. 56 // 57 // NOTE: Term should not mix Latin and non-Latin characters. Doing so may 58 // result in the non-Latin characters not properly being normalized 59 std::string NormalizeTerm(std::string_view term) const override; 60 61 // Returns a CharacterIterator pointing to one past the end of the segment of 62 // term that (once normalized) matches with normalized_term. 63 // 64 // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return 65 // CharacterIterator(u8:4, u16:4, u32:4). 66 // 67 // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return 68 // CharacterIterator(u8:0, u16:0, u32:0). 69 CharacterIterator FindNormalizedMatchEndPosition( 70 std::string_view term, std::string_view normalized_term) const override; 71 72 private: 73 // A handler class that helps manage the lifecycle of UTransliterator. It's 74 // used in IcuNormalizer to transform terms into the formats we need. 75 class TermTransformer { 76 public: 77 // Creates TermTransformer with a valid UTransliterator instance 78 // 79 // Returns: 80 // A term transformer on success 81 // INTERNAL_ERROR if failed to create any subcomponent 82 static libtextclassifier3::StatusOr<std::unique_ptr<TermTransformer>> 83 Create(); 84 85 // Closes the UTransliterator instance 86 ~TermTransformer(); 87 88 // Transforms the text based on our rules described at top of this file 89 std::string Transform(std::string_view term) const; 90 91 // Returns a CharacterIterator pointing to one past the end of the segment 92 // of a non-latin term that (once normalized) matches with normalized_term. 93 CharacterIterator FindNormalizedNonLatinMatchEndPosition( 94 std::string_view term, CharacterIterator char_itr, 95 std::string_view normalized_term) const; 96 97 private: 98 explicit TermTransformer(UTransliterator* u_transliterator); 99 100 // An ICU class to execute custom term transformation / normalization rules. 101 // utrans_close() must by called after using. 102 UTransliterator* u_transliterator_; 103 }; 104 105 struct NormalizeLatinResult { 106 // A string representing the maximum prefix of term (can be empty or term 107 // itself) that can be normalized into ASCII. 108 std::string text; 109 // The first position of the char within term that normalization failed to 110 // transform into an ASCII char, or term.length() if all chars can be 111 // transformed. 112 size_t end_pos; 113 }; 114 115 explicit IcuNormalizer(std::unique_ptr<TermTransformer> term_transformer, 116 int max_term_byte_size); 117 118 // Helper method to normalize Latin terms only. Rules applied: 119 // 1. Uppercase to lowercase 120 // 2. Remove diacritic (accent) marks 121 NormalizeLatinResult NormalizeLatin(const UNormalizer2* normalizer2, 122 std::string_view term) const; 123 124 // Set char_itr and normalized_char_itr to point to one past the end of the 125 // segments of term and normalized_term that can match if normalized into 126 // ASCII. In this case, true will be returned. 127 // 128 // The method stops at the position when char_itr cannot be normalized into 129 // ASCII and returns false, so that term_transformer can handle the remaining 130 // portion. 131 bool FindNormalizedLatinMatchEndPosition( 132 const UNormalizer2* normalizer2, std::string_view term, 133 CharacterIterator& char_itr, std::string_view normalized_term, 134 CharacterIterator& normalized_char_itr) const; 135 136 // Used to transform terms into their normalized forms. 137 std::unique_ptr<TermTransformer> term_transformer_; 138 139 // The maximum term length allowed after normalization. 140 int max_term_byte_size_; 141 }; 142 143 } // namespace lib 144 } // namespace icing 145 146 #endif // ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_ 147