// Copyright (C) 2022 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <memory>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

18 #include "testing/base/public/gmock.h"
19 #include "testing/base/public/gunit.h"
20 #include "third_party/icing/portable/platform.h"
21 #include "third_party/icing/proto/schema_proto_portable.pb.h"
22 #include "third_party/icing/testing/common-matchers.h"
23 #include "third_party/icing/testing/icu-data-file-helper.h"
24 #include "third_party/icing/testing/jni-test-helpers.h"
25 #include "third_party/icing/testing/test-data.h"
26 #include "third_party/icing/tokenization/language-segmenter-factory.h"
27 #include "third_party/icing/tokenization/language-segmenter.h"
28 #include "third_party/icing/tokenization/tokenizer-factory.h"
29 #include "third_party/icing/tokenization/tokenizer.h"
30 #include "third_party/icu/include/unicode/uloc.h"
31 
32 namespace icing {
33 namespace lib {
34 
35 namespace {
36 
37 using ::testing::ElementsAre;
38 
// These tests ensure that the indexing and query tokenizers treat the
// various kinds of text segments in the same manner.
class CombinedTokenizerTest : public ::testing::Test {
 protected:
  void SetUp() override {
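    // The ICU data file is only set up for ICU-based tokenization; the
    // CfString and reverse-JNI configurations skip it.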
    if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
      ICING_ASSERT_OK(
          // File generated via icu_data_file rule in //third_party/icing/BUILD.
          icu_data_file_helper::SetUpICUDataFile(
              GetTestFilePath("third_party/icing/icu.dat")));
    }
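    // GetTestJniCache() may return a null cache when the test isn't built
    // for reverse-JNI segmentation.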
    jni_cache_ = GetTestJniCache();

    language_segmenter_factory::SegmenterOptions options(ULOC_US,
                                                         jni_cache_.get());
    ICING_ASSERT_OK_AND_ASSIGN(
        lang_segmenter_,
        language_segmenter_factory::Create(std::move(options)));
  }

  std::unique_ptr<const JniCache> jni_cache_;
  std::unique_ptr<LanguageSegmenter> lang_segmenter_;
};

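// Collects the text of every REGULAR token, in order. Tokens of any other
// type are dropped, so the indexing and query outputs can be compared
// term-for-term in the tests below.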
std::vector<std::string> GetTokenTerms(const std::vector<Token>& tokens) {
  std::vector<std::string> terms;
  terms.reserve(tokens.size());
  for (const Token& token : tokens) {
    if (token.type == Token::Type::REGULAR) {
      terms.push_back(std::string(token.text));
    }
  }
  return terms;
}

}  // namespace

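// Both tokenizers should keep the leading special character as its own term
// and strip the trailing '!' and '?'.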
TEST_F(CombinedTokenizerTest, SpecialCharacters) {
  const std::string_view kText = "😊 Hello! Goodbye?";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> indexing_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> query_tokenizer,
      CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
                           lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText));
  std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre("😊", "Hello", "Goodbye"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
                             query_tokenizer->TokenizeAll(kText));
  std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
  EXPECT_THAT(query_terms, ElementsAre("😊", "Hello", "Goodbye"));
}

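// Parentheses should act purely as separators for both tokenizers and never
// appear in the extracted terms.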
TEST_F(CombinedTokenizerTest, Parentheses) {
  const std::string_view kText = "((paren1)(paren2) (last paren))";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> indexing_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> query_tokenizer,
      CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
                           lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText));
  std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre("paren1", "paren2", "last", "paren"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
                             query_tokenizer->TokenizeAll(kText));
  std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
  EXPECT_THAT(query_terms, ElementsAre("paren1", "paren2", "last", "paren"));
}

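// A leading '-' should not become part of the extracted term for either
// tokenizer.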
TEST_F(CombinedTokenizerTest, Negation) {
  const std::string_view kText = "-foo -bar -baz";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> indexing_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> query_tokenizer,
      CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
                           lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText));
  std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
                             query_tokenizer->TokenizeAll(kText));
  std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
  EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz"));
}

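// Stray ':' characters at term boundaries should be dropped by both
// tokenizers.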
TEST_F(CombinedTokenizerTest, Colons) {
  const std::string_view kText = ":foo: :bar baz:";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> indexing_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> query_tokenizer,
      CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
                           lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText));
  std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre("foo", "bar", "baz"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
                             query_tokenizer->TokenizeAll(kText));
  std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
  EXPECT_THAT(query_terms, ElementsAre("foo", "bar", "baz"));
}

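// Unlike the tests above, this one documents an intentional difference
// between the two tokenizers.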
TEST_F(CombinedTokenizerTest, ColonsPropertyRestricts) {
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> indexing_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> query_tokenizer,
      CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
                           lang_segmenter_.get()));

  // This is a difference between the two tokenizers. "foo:bar" is a single
  // token to the plain tokenizer because ':' is a word connector. But "foo:bar"
  // is a property restrict to the query tokenizer - so "foo" is the property
  // and "bar" is the only text term.
  constexpr std::string_view kText = "foo:bar";
  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText));
  std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre("foo:bar"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
                             query_tokenizer->TokenizeAll(kText));
  std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
  EXPECT_THAT(query_terms, ElementsAre("bar"));
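  // ("foo" presumably still surfaces from the query tokenizer, but as a
  // non-REGULAR property token, which GetTokenTerms filters out.)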

  // This difference, however, should only apply to the first ':'. A
  // second ':' should be treated by both tokenizers as a word connector.
  constexpr std::string_view kText2 = "foo:bar:baz";
  ICING_ASSERT_OK_AND_ASSIGN(indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText2));
  indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre("foo:bar:baz"));

  ICING_ASSERT_OK_AND_ASSIGN(query_tokens,
                             query_tokenizer->TokenizeAll(kText2));
  query_terms = GetTokenTerms(query_tokens);
  EXPECT_THAT(query_terms, ElementsAre("bar:baz"));
}

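// Sentence punctuation ('?', '!', '&', '.') should be stripped entirely;
// letter case is preserved.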
TEST_F(CombinedTokenizerTest, Punctuation) {
  const std::string_view kText = "Who? What!? Why & How.";
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> indexing_tokenizer,
      tokenizer_factory::CreateIndexingTokenizer(
          StringIndexingConfig::TokenizerType::PLAIN, lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Tokenizer> query_tokenizer,
      CreateQueryTokenizer(tokenizer_factory::QueryTokenizerType::RAW_QUERY,
                           lang_segmenter_.get()));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> indexing_tokens,
                             indexing_tokenizer->TokenizeAll(kText));
  std::vector<std::string> indexing_terms = GetTokenTerms(indexing_tokens);
  EXPECT_THAT(indexing_terms, ElementsAre("Who", "What", "Why", "How"));

  ICING_ASSERT_OK_AND_ASSIGN(std::vector<Token> query_tokens,
                             query_tokenizer->TokenizeAll(kText));
  std::vector<std::string> query_terms = GetTokenTerms(query_tokens);
  EXPECT_THAT(query_terms, ElementsAre("Who", "What", "Why", "How"));
}

}  // namespace lib
}  // namespace icing