// Copyright (C) 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "testing/base/public/gmock.h"
#include "testing/base/public/gunit.h"
#include "third_party/icing/portable/platform.h"
#include "third_party/icing/proto/schema_proto_portable.pb.h"
#include "third_party/icing/testing/common-matchers.h"
#include "third_party/icing/testing/icu-data-file-helper.h"
#include "third_party/icing/testing/jni-test-helpers.h"
#include "third_party/icing/testing/test-data.h"
#include "third_party/icing/tokenization/language-segmenter-factory.h"
#include "third_party/icing/tokenization/language-segmenter.h"
#include "third_party/icing/tokenization/tokenizer-factory.h"
#include "third_party/icing/tokenization/tokenizer.h"
#include "third_party/icu/include/unicode/uloc.h"

namespace icing {
namespace lib {

namespace {

using ::testing::ElementsAre;

// These tests ensure that the different tokenizers treat the various
// segments of text in the same manner.
class CombinedTokenizerTest : public ::testing::Test {
 protected:
  void SetUp() override {
    if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
      ICING_ASSERT_OK(
          // File generated via icu_data_file rule in //third_party/icing/BUILD.
          icu_data_file_helper::SetUpICUDataFile(
              GetTestFilePath("third_party/icing/icu.dat")));
    }
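    // On platforms that don't use JNI-based tokenization, this cache is
    // expected to be null and the segmenter falls back to ICU.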
    jni_cache_ = GetTestJniCache();

    language_segmenter_factory::SegmenterOptions options(ULOC_US,
                                                         jni_cache_.get());
    ICING_ASSERT_OK_AND_ASSIGN(
        lang_segmenter_,
        language_segmenter_factory::Create(std::move(options)));
  }

  std::unique_ptr<const JniCache> jni_cache_;
  std::unique_ptr<LanguageSegmenter> lang_segmenter_;
};

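// Returns the text of all REGULAR tokens in |tokens|, in order. Tokens of
// other types (e.g. query operators) are skipped so that indexing and query
// outputs can be compared term-for-term.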
std::vector<std::string> GetTokenTerms(const std::vector<Token>& tokens) {
  std::vector<std::string> terms;
  terms.reserve(tokens.size());
  for (const Token& token : tokens) {
    if (token.type == Token::Type::REGULAR) {
      terms.push_back(std::string(token.text));
    }
  }
  return terms;
}

}  // namespace

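// Sentence punctuation ('!', '?') should never appear in the term text
// produced by either tokenizer.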
TEST_F(CombinedTokenizerTest, SpecialCharacters) {
  const std::string_view kText = " Hello! Goodbye?";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> indexing_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> query_tokenizer,
      tokenizer_factory::CreateQueryTokenizer(
          tokenizer_factory::QueryTokenizerType::RAW_QUERY,
          lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText));
  std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre("", "Hello", "Goodbye"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
                             query_tokenizer->TokenizeAll(kText));
  std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
  EXPECT_THAT(query_terms, ElementsAre("", "Hello", "Goodbye"));
}

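// Parentheses act as segment boundaries for both tokenizers; they should
// never appear in term text, even though the raw-query tokenizer also
// interprets them as grouping operators.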
TEST_F(CombinedTokenizerTest, Parentheses) {
  const std::string_view kText = "((paren1)(paren2) (last paren))";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> indexing_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> query_tokenizer,
      tokenizer_factory::CreateQueryTokenizer(
          tokenizer_factory::QueryTokenizerType::RAW_QUERY,
          lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText));
  std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre("paren1", "paren2", "last", "paren"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
                             query_tokenizer->TokenizeAll(kText));
  std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
  EXPECT_THAT(query_terms, ElementsAre("paren1", "paren2", "last", "paren"));
}

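// A leading '-' marks negation in a raw query, but the extracted term text
// should match the indexing tokenizer's output either way.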
TEST_F(CombinedTokenizerTest, Negation) {
  const std::string_view kText = "-foo -bar -baz";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> indexing_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> query_tokenizer,
      tokenizer_factory::CreateQueryTokenizer(
          tokenizer_factory::QueryTokenizerType::RAW_QUERY,
          lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText));
  std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
                             query_tokenizer->TokenizeAll(kText));
  std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
  EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz"));
}

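// Stray colons (leading or trailing a term) should be dropped from term text
// by both tokenizers.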
TEST_F(CombinedTokenizerTest, Colons) {
  const std::string_view kText = ":foo: :bar baz:";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> indexing_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> query_tokenizer,
      tokenizer_factory::CreateQueryTokenizer(
          tokenizer_factory::QueryTokenizerType::RAW_QUERY,
          lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText));
  std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
                             query_tokenizer->TokenizeAll(kText));
  std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
  EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz"));
}

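// Documents an intentional difference between the two tokenizers: how the
// first ':' after a term is interpreted.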
TEST_F(CombinedTokenizerTest, ColonsPropertyRestricts) {
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> indexing_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> query_tokenizer,
      tokenizer_factory::CreateQueryTokenizer(
          tokenizer_factory::QueryTokenizerType::RAW_QUERY,
          lang_segmenter_.get()));

  // This is a difference between the two tokenizers. "foo:bar" is a single
  // token to the plain tokenizer because ':' is a word connector. But to the
  // query tokenizer "foo:bar" is a property restrict, so "foo" is the
  // property name and "bar" is the only text term.
  constexpr std::string_view kText = "foo:bar";
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText));
  std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre("foo:bar"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
                             query_tokenizer->TokenizeAll(kText));
  std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
  EXPECT_THAT(query_terms, ElementsAre("bar"));

  // This difference, however, should only apply to the first ':'. A second
  // ':' should be treated by both tokenizers as a word connector.
  constexpr std::string_view kText2 = "foo:bar:baz";
  ICING_ASSERT_OK_AND_ASSIGN(indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText2));
  indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre("foo:bar:baz"));

  ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
                             query_tokenizer->TokenizeAll(kText2));
  query_terms = GetTokenTerms(query_tokens);
  EXPECT_THAT(query_terms, ElementsAre("bar:baz"));
}

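// Interior punctuation ('?', '!', '&', '.') should be stripped identically by
// both tokenizers.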
TEST_F(CombinedTokenizerTest, Punctuation) {
  const std::string_view kText = "Who? What!? Why & How.";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> indexing_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> query_tokenizer,
      tokenizer_factory::CreateQueryTokenizer(
          tokenizer_factory::QueryTokenizerType::RAW_QUERY,
          lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText));
  std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre("Who", "What", "Why", "How"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
                             query_tokenizer->TokenizeAll(kText));
  std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
  EXPECT_THAT(query_terms, ElementsAre("Who", "What", "Why", "How"));
}

}  // namespace lib
}  // namespace icing