• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_
16 #define ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_
17 
18 #include <memory>
19 #include <string>
20 #include <string_view>
21 
22 #include "icing/text_classifier/lib3/utils/base/statusor.h"
23 #include "icing/transform/normalizer.h"
24 #include "icing/util/character-iterator.h"
25 #include "unicode/unorm2.h"
26 #include "unicode/utrans.h"
27 
28 namespace icing {
29 namespace lib {
30 
31 // Used to normalize UTF8 strings for text matching. It enforces a set of rules:
32 //  1. Transforms text to be lowercase UTF8.
33 //  2. Transforms full-width Latin characters to ASCII characters if possible.
34 //  3. Transforms hiragana to katakana.
35 //  4. Removes accent / diacritic marks on Latin characters
36 //  5. Removes accent / diacritic marks on Greek characters
37 //  6. Normalized text must be less than or equal to max_term_byte_size,
38 //     otherwise it will be truncated.
39 //
40 // There're some other rules from ICU not listed here, please see .cc file for
41 // details.
42 class IcuNormalizer : public Normalizer {
43  public:
44   // Creates a normalizer with the subcomponents it needs. max_term_byte_size
45   // enforces the max size of text after normalization, text will be truncated
46   // if exceeds the max size.
47   //
48   // Returns:
49   //   A normalizer on success
50   //   INVALID_ARGUMENT if max_term_byte_size <= 0
51   //   INTERNAL_ERROR if failed to create any subcomponent
52   static libtextclassifier3::StatusOr<std::unique_ptr<IcuNormalizer>> Create(
53       int max_term_byte_size);
54 
55   // Normalizes the input term based on rules. See .cc file for rule details.
56   //
57   // NOTE: Term should not mix Latin and non-Latin characters. Doing so may
58   // result in the non-Latin characters not properly being normalized
59   std::string NormalizeTerm(std::string_view term) const override;
60 
61   // Returns a CharacterIterator pointing to one past the end of the segment of
62   // term that (once normalized) matches with normalized_term.
63   //
64   // Ex. FindNormalizedMatchEndPosition("YELLOW", "yell") will return
65   // CharacterIterator(u8:4, u16:4, u32:4).
66   //
67   // Ex. FindNormalizedMatchEndPosition("YELLOW", "red") will return
68   // CharacterIterator(u8:0, u16:0, u32:0).
69   CharacterIterator FindNormalizedMatchEndPosition(
70       std::string_view term, std::string_view normalized_term) const override;
71 
72  private:
73   // A handler class that helps manage the lifecycle of UTransliterator. It's
74   // used in IcuNormalizer to transform terms into the formats we need.
75   class TermTransformer {
76    public:
77     // Creates TermTransformer with a valid UTransliterator instance
78     //
79     // Returns:
80     //   A term transformer on success
81     //   INTERNAL_ERROR if failed to create any subcomponent
82     static libtextclassifier3::StatusOr<std::unique_ptr<TermTransformer>>
83     Create();
84 
85     // Closes the UTransliterator instance
86     ~TermTransformer();
87 
88     // Transforms the text based on our rules described at top of this file
89     std::string Transform(std::string_view term) const;
90 
91     // Returns a CharacterIterator pointing to one past the end of the segment
92     // of a non-latin term that (once normalized) matches with normalized_term.
93     CharacterIterator FindNormalizedNonLatinMatchEndPosition(
94         std::string_view term, CharacterIterator char_itr,
95         std::string_view normalized_term) const;
96 
97    private:
98     explicit TermTransformer(UTransliterator* u_transliterator);
99 
100     // An ICU class to execute custom term transformation / normalization rules.
101     // utrans_close() must by called after using.
102     UTransliterator* u_transliterator_;
103   };
104 
105   struct NormalizeLatinResult {
106     // A string representing the maximum prefix of term (can be empty or term
107     // itself) that can be normalized into ASCII.
108     std::string text;
109     // The first position of the char within term that normalization failed to
110     // transform into an ASCII char, or term.length() if all chars can be
111     // transformed.
112     size_t end_pos;
113   };
114 
115   explicit IcuNormalizer(std::unique_ptr<TermTransformer> term_transformer,
116                          int max_term_byte_size);
117 
118   // Helper method to normalize Latin terms only. Rules applied:
119   // 1. Uppercase to lowercase
120   // 2. Remove diacritic (accent) marks
121   NormalizeLatinResult NormalizeLatin(const UNormalizer2* normalizer2,
122                                       std::string_view term) const;
123 
124   // Set char_itr and normalized_char_itr to point to one past the end of the
125   // segments of term and normalized_term that can match if normalized into
126   // ASCII. In this case, true will be returned.
127   //
128   // The method stops at the position when char_itr cannot be normalized into
129   // ASCII and returns false, so that term_transformer can handle the remaining
130   // portion.
131   bool FindNormalizedLatinMatchEndPosition(
132       const UNormalizer2* normalizer2, std::string_view term,
133       CharacterIterator& char_itr, std::string_view normalized_term,
134       CharacterIterator& normalized_char_itr) const;
135 
136   // Used to transform terms into their normalized forms.
137   std::unique_ptr<TermTransformer> term_transformer_;
138 
139   // The maximum term length allowed after normalization.
140   int max_term_byte_size_;
141 };
142 
143 }  // namespace lib
144 }  // namespace icing
145 
146 #endif  // ICING_TRANSFORM_ICU_ICU_NORMALIZER_H_
147