1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/result/snippet-retriever.h"
16
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20
21 #include "gmock/gmock.h"
22 #include "gtest/gtest.h"
23 #include "icing/document-builder.h"
24 #include "icing/file/mock-filesystem.h"
25 #include "icing/portable/equals-proto.h"
26 #include "icing/portable/platform.h"
27 #include "icing/proto/document.pb.h"
28 #include "icing/proto/schema.pb.h"
29 #include "icing/proto/search.pb.h"
30 #include "icing/proto/term.pb.h"
31 #include "icing/query/query-terms.h"
32 #include "icing/schema-builder.h"
33 #include "icing/schema/schema-store.h"
34 #include "icing/schema/section-manager.h"
35 #include "icing/store/document-id.h"
36 #include "icing/store/key-mapper.h"
37 #include "icing/testing/common-matchers.h"
38 #include "icing/testing/fake-clock.h"
39 #include "icing/testing/icu-data-file-helper.h"
40 #include "icing/testing/jni-test-helpers.h"
41 #include "icing/testing/test-data.h"
42 #include "icing/testing/tmp-directory.h"
43 #include "icing/tokenization/language-segmenter-factory.h"
44 #include "icing/tokenization/language-segmenter.h"
45 #include "icing/transform/map/map-normalizer.h"
46 #include "icing/transform/normalizer-factory.h"
47 #include "icing/transform/normalizer.h"
48 #include "icing/util/snippet-helpers.h"
49 #include "unicode/uloc.h"
50
51 namespace icing {
52 namespace lib {
53
54 namespace {
55
56 using ::testing::ElementsAre;
57 using ::testing::Eq;
58 using ::testing::IsEmpty;
59 using ::testing::SizeIs;
60
// TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export
// to Android. Also move it to schema-builder.h
//
// Shorthand alias for the URL tokenizer type. It is only defined when the
// build sets ENABLE_URL_TOKENIZER.
#ifdef ENABLE_URL_TOKENIZER
constexpr StringIndexingConfig::TokenizerType::Code TOKENIZER_URL =
    StringIndexingConfig::TokenizerType::URL;
#endif  // ENABLE_URL_TOKENIZER
67
GetPropertyPaths(const SnippetProto & snippet)68 std::vector<std::string_view> GetPropertyPaths(const SnippetProto& snippet) {
69 std::vector<std::string_view> paths;
70 for (const SnippetProto::EntryProto& entry : snippet.entries()) {
71 paths.push_back(entry.property_name());
72 }
73 return paths;
74 }
75
76 class SnippetRetrieverTest : public testing::Test {
77 protected:
SetUp()78 void SetUp() override {
79 test_dir_ = GetTestTempDir() + "/icing";
80 filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
81
82 if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
83 ICING_ASSERT_OK(
84 // File generated via icu_data_file rule in //icing/BUILD.
85 icu_data_file_helper::SetUpICUDataFile(
86 GetTestFilePath("icing/icu.dat")));
87 }
88
89 jni_cache_ = GetTestJniCache();
90 language_segmenter_factory::SegmenterOptions options(ULOC_US,
91 jni_cache_.get());
92 ICING_ASSERT_OK_AND_ASSIGN(
93 language_segmenter_,
94 language_segmenter_factory::Create(std::move(options)));
95
96 // Setup the schema
97 ICING_ASSERT_OK_AND_ASSIGN(
98 schema_store_,
99 SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
100 SchemaProto schema =
101 SchemaBuilder()
102 .AddType(
103 SchemaTypeConfigBuilder()
104 .SetType("email")
105 .AddProperty(PropertyConfigBuilder()
106 .SetName("subject")
107 .SetDataTypeString(TERM_MATCH_PREFIX,
108 TOKENIZER_PLAIN)
109 .SetCardinality(CARDINALITY_OPTIONAL))
110 .AddProperty(PropertyConfigBuilder()
111 .SetName("body")
112 .SetDataTypeString(TERM_MATCH_EXACT,
113 TOKENIZER_PLAIN)
114 .SetCardinality(CARDINALITY_OPTIONAL)))
115 .Build();
116 ICING_ASSERT_OK(schema_store_->SetSchema(
117 schema, /*ignore_errors_and_delete_documents=*/false,
118 /*allow_circular_schema_definitions=*/false));
119
120 ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
121 /*max_term_byte_size=*/10000));
122 ICING_ASSERT_OK_AND_ASSIGN(
123 snippet_retriever_,
124 SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
125 normalizer_.get()));
126
127 // Set limits to max - effectively no limit. Enable matching and request a
128 // window of 64 bytes.
129 snippet_spec_.set_num_to_snippet(std::numeric_limits<int32_t>::max());
130 snippet_spec_.set_num_matches_per_property(
131 std::numeric_limits<int32_t>::max());
132 snippet_spec_.set_max_window_utf32_length(64);
133 }
134
TearDown()135 void TearDown() override {
136 filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
137 }
138
139 Filesystem filesystem_;
140 FakeClock fake_clock_;
141 std::unique_ptr<SchemaStore> schema_store_;
142 std::unique_ptr<LanguageSegmenter> language_segmenter_;
143 std::unique_ptr<SnippetRetriever> snippet_retriever_;
144 std::unique_ptr<Normalizer> normalizer_;
145 std::unique_ptr<const JniCache> jni_cache_;
146 ResultSpecProto::SnippetSpecProto snippet_spec_;
147 std::string test_dir_;
148 };
149
// SnippetRetriever::Create must reject a null pointer for each of its three
// dependencies with FAILED_PRECONDITION.
TEST_F(SnippetRetrieverTest, CreationWithNullPointerShouldFail) {
  const auto expected_code =
      libtextclassifier3::StatusCode::FAILED_PRECONDITION;

  // Null schema store.
  EXPECT_THAT(
      SnippetRetriever::Create(/*schema_store=*/nullptr,
                               language_segmenter_.get(), normalizer_.get()),
      StatusIs(expected_code));

  // Null language segmenter.
  EXPECT_THAT(SnippetRetriever::Create(schema_store_.get(),
                                       /*language_segmenter=*/nullptr,
                                       normalizer_.get()),
              StatusIs(expected_code));

  // Null normalizer.
  EXPECT_THAT(
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               /*normalizer=*/nullptr),
      StatusIs(expected_code));
}
164
// A max window length (4) shorter than the matched token "three" is expected
// to produce an empty window.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window starts at the beginning of "three" and ends in the middle of
  // "three". len=4, orig_window= "thre"
  snippet_spec_.set_max_window_utf32_length(4);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre(""));
}
189
// A max window length equal to the length of the odd-length match "three" (5)
// is expected to produce a window containing exactly the match.
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowSizeEqualToMatch_OddLengthMatch) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window starts at the beginning of "three" and ends at the exact end of
  // "three". len=5, orig_window= "three"
  snippet_spec_.set_max_window_utf32_length(5);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre("three"));
}
215
// A max window length equal to the length of the even-length match "four" (4)
// is expected to produce a window containing exactly the match.
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowSizeEqualToMatch_EvenLengthMatch) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"four"}}};

  // Window starts at the beginning of "four" and ends at the exact end of
  // "four". len=4, orig_window= "four"
  snippet_spec_.set_max_window_utf32_length(4);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre("four"));
}
241
// With a max window length of 14 the untrimmed window starts in the
// whitespace before "two"; the expected window is "two three four".
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (2,17).
  //   2. trimmed, no-shifting window [4,13) "two three"
  //   3. trimmed, shifted window [4,18) "two three four"
  snippet_spec_.set_max_window_utf32_length(14);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre("two three four"));
}
274
// With a max window length of 16 the untrimmed window starts in the middle of
// "one"; the expected window is "two three four..".
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (1,18).
  //   2. trimmed, no-shifting window [4,18) "two three four"
  //   3. trimmed, shifted window [4,20) "two three four.."
  snippet_spec_.set_max_window_utf32_length(16);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre("two three four.."));
}
307
// With a max window length of 20 the window ends partway through the run of
// '.' characters; the expected window is "one two three four..".
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window ends in the middle of all the punctuation and window starts at 0.
  // len=20, orig_window="one two three four.."
  snippet_spec_.set_max_window_utf32_length(20);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre("one two three four.."));
}
333
// With a max window length of 24 the window ends right after the multi-byte
// punctuation mark; the expected window is "down in Australia¿".
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowEndsMultiBytePunctuation) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body",
                             "Is everything upside down in Australia¿ Crikey!")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};

  // Window starts in the middle of "upside" and ends right after the
  // multi-byte punctuation mark.
  // len=24, orig_window="pside down in Australia¿"
  snippet_spec_.set_max_window_utf32_length(24);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre("down in Australia¿"));
}
361
// With a max window length of 26 the window ends just past the multi-byte
// punctuation mark; the expected window is "upside down in Australia¿".
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowBeyondMultiBytePunctuation) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body",
                             "Is everything upside down in Australia¿ Crikey!")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};

  // Window starts at the beginning of "upside" and ends in the whitespace
  // just past the multi-byte punctuation mark.
  // len=26, orig_window="upside down in Australia¿ "
  snippet_spec_.set_max_window_utf32_length(26);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("upside down in Australia¿"));
}
389
// With a max window length of 22 the untrimmed window starts before index 0;
// the expected window is "one two three four....".
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-2,21).
  //   2. trimmed, no-shifting window [0,21) "one two three four..."
  //   3. trimmed, shifted window [0,22) "one two three four...."
  snippet_spec_.set_max_window_utf32_length(22);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four...."));
}
422
// With a max window length of 26 the window ends in the whitespace before
// "five"; the expected window is "one two three four....".
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window ends before "five" but after all the punctuation
  // len=26, orig_window="one two three four.... "
  snippet_spec_.set_max_window_utf32_length(26);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four...."));
}
448
// With a max window length of 32 the untrimmed window ends in the middle of
// "five"; the expected window is the whole body.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-7,26).
  //   2. trimmed, no-shifting window [0,26) "one two three four...."
  //   3. trimmed, shifted window [0,27) "one two three four.... five"
  snippet_spec_.set_max_window_utf32_length(32);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four.... five"));
}
481
// With a max window length of 34 the expected window is the whole body.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Max window size equals the size of the value.
  // len=34, orig_window="one two three four.... five"
  // NOTE(review): the body is 27 characters long, so 34 is not literally the
  // value size; presumably 34 is the length at which the window centered on
  // "three" ends exactly at the end of the value - confirm.
  snippet_spec_.set_max_window_utf32_length(34);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four.... five"));
}
507
// A max window length (36) larger than the body is expected to produce a
// window covering the whole body.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Max window size exceeds the size of the value.
  // len=36, orig_window="one two three four.... five"
  snippet_spec_.set_max_window_utf32_length(36);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four.... five"));
}
533
// A match near the start of the text ("two") with a max window length of 28:
// the expected window is "one two three four.... five".
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five six")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"two"}}};

  // String:      "one two three four.... five six"
  //               ^   ^   ^     ^        ^    ^  ^
  // UTF-8 idx:    0   4   8     14       23   28 31
  // UTF-32 idx:   0   4   8     14       23   28 31
  //
  // The untrimmed window extends past the start of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-10,19).
  //   2. trimmed, no-shifting window [0,19) "one two three four."
  //   3. trimmed, shifted window [0,27) "one two three four.... five"
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four.... five"));
}
567
// A match near the end of the text ("five") with a max window length of 28:
// the expected window is "two three four.... five six".
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five six")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"five"}}};

  // String:      "one two three four.... five six"
  //               ^   ^   ^     ^        ^    ^  ^
  // UTF-8 idx:    0   4   8     14       23   28 31
  // UTF-32 idx:   0   4   8     14       23   28 31
  //
  // The untrimmed window extends past the end of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (10,39).
  //   2. trimmed, no-shifting window [14,31) "four.... five six"
  //   3. trimmed, shifted window [4,31) "two three four.... five six"
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("two three four.... five six"));
}
601
// A match near the start of a body shorter than the max window length (28):
// the expected window is the whole body.
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four....")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"two"}}};

  // String:      "one two three four...."
  //               ^   ^   ^     ^       ^
  // UTF-8 idx:    0   4   8     14      22
  // UTF-32 idx:   0   4   8     14      22
  //
  // The untrimmed window extends past the start of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-10,19).
  //   2. trimmed, no-shifting window [0, 19) "one two three four."
  //   3. trimmed, shifted window [0, 22) "one two three four...."
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four...."));
}
635
// A match near the end of a body shorter than the max window length (28):
// the expected window is the whole body.
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four....")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"four"}}};

  // String:      "one two three four...."
  //               ^   ^   ^     ^       ^
  // UTF-8 idx:    0   4   8     14      22
  // UTF-32 idx:   0   4   8     14      22
  //
  // The untrimmed window extends past the end of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (1,30).
  //   2. trimmed, no-shifting window [4, 22) "two three four...."
  //   3. trimmed, shifted window [0, 22) "one two three four...."
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("body"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry),
              ElementsAre("one two three four...."));
}
669
// A prefix query "f" is expected to snippet only the prefix-enabled "subject"
// property, reporting the match "foo" and the submatch "f".
TEST_F(SnippetRetrieverTest, PrefixSnippeting) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body", "Only a fool would match this content.")
          .Build();
  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"f"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Check the snippets. 'f' should match prefix-enabled property 'subject', but
  // not exact-only property 'body'
  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("subject"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre("subject foo"));
  EXPECT_THAT(GetMatches(content, entry), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(content, entry), ElementsAre("f"));
}
694
// An exact-match query "f" is expected to produce no snippet entries for a
// document that only contains "foo" and "fool".
TEST_F(SnippetRetrieverTest, ExactSnippeting) {
  DocumentProto email =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body", "Only a fool would match this content.")
          .Build();

  SectionIdMask sections = 0b00000011;
  SectionRestrictQueryTermsMap terms{{"", {"f"}}};
  SnippetProto result = snippet_retriever_->RetrieveSnippet(
      terms, TERM_MATCH_EXACT, snippet_spec_, email, sections);

  // No property should have been snippeted.
  EXPECT_THAT(result.entries(), IsEmpty());
}
712
// With the max window length set to 0, the match and submatch are still
// expected to be reported, but the window is expected to be empty.
TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body", "Only a fool would match this content.")
          .Build();

  snippet_spec_.set_max_window_utf32_length(0);

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Check the snippets
  // Abort on a size mismatch: entries(0) below would be out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& entry = snippet.entries(0);
  EXPECT_THAT(entry.property_name(), Eq("subject"));
  std::string_view content = GetString(&document, entry.property_name());
  EXPECT_THAT(GetWindows(content, entry), ElementsAre(""));
  EXPECT_THAT(GetMatches(content, entry), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(content, entry), ElementsAre("foo"));
}
738
// Two query terms ("foo", "bar") matching across both properties: the "body"
// entry is expected first with two windows, followed by the "subject" entry
// with one.
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();
  // String:      "Concerning the subject of foo, we need to begin considering "
  //               ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
  // UTF-8 idx:    0          11  15      23 26   31 34   39 42    48
  // UTF-32 idx:   0          11  15      23 26   31 34   39 42    48
  //
  // String ctd:  "our options regarding body bar."
  //               ^   ^       ^         ^    ^   ^
  // UTF-8 idx:    60  64      72        82   87  91
  // UTF-32 idx:   60  64      72        82   87  91
  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Check the snippets
  // Abort on a size mismatch: entries(0)/entries(1) below would be out of
  // range.
  ASSERT_THAT(snippet.entries(), SizeIs(2));

  const SnippetProto::EntryProto& body_entry = snippet.entries(0);
  EXPECT_THAT(body_entry.property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, body_entry.property_name());
  // The first window will be:
  //   1. untrimmed, no-shifting window will be (-6,59).
  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
  //   3. trimmed, shifted window [0, 63) "Concerning... our"
  // The second window will be:
  //   1. untrimmed, no-shifting window will be (54,91).
  //   2. trimmed, no-shifting window [60, 91) "our... bar.".
  //   3. trimmed, shifted window [31, 91) "we... bar."
  EXPECT_THAT(
      GetWindows(content, body_entry),
      ElementsAre(
          "Concerning the subject of foo, we need to begin considering our",
          "we need to begin considering our options regarding body bar."));
  EXPECT_THAT(GetMatches(content, body_entry), ElementsAre("foo", "bar"));
  EXPECT_THAT(GetSubMatches(content, body_entry), ElementsAre("foo", "bar"));

  const SnippetProto::EntryProto& subject_entry = snippet.entries(1);
  EXPECT_THAT(subject_entry.property_name(), Eq("subject"));
  content = GetString(&document, subject_entry.property_name());
  EXPECT_THAT(GetWindows(content, subject_entry), ElementsAre("subject foo"));
  EXPECT_THAT(GetMatches(content, subject_entry), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(content, subject_entry), ElementsAre("foo"));
}
793
// Verifies that sections excluded by the section mask produce no snippet
// entries: only 'body' is snippeted even though 'subject' also contains "foo".
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();
  // String:     "Concerning the subject of foo, we need to begin considering "
  //              ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
  // UTF-8 idx:   0          11  15      23 26   31 34   39 42    48
  // UTF-32 idx:  0          11  15      23 26   31 34   39 42    48
  //
  // String ctd: "our options regarding body bar."
  //              ^   ^       ^         ^    ^   ^
  // UTF-8 idx:   60  64      72        82   87  91
  // UTF-32 idx:  60  64      72        82   87  91
  //
  // Section 1 "subject" is not in the section_mask, so no snippet information
  // from that section should be returned by the SnippetRetriever.
  SectionIdMask section_mask = 0b00000001;
  SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Check the snippets. The size check is fatal because entries(0) is indexed
  // below; continuing with an empty result would read out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  // The first window will be:
  //   1. untrimmed, no-shifting window will be (-6,59).
  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
  //   3. trimmed, shifted window [0, 63) "Concerning... our"
  // The second window will be:
  //   1. untrimmed, no-shifting window will be (54,91).
  //   2. trimmed, no-shifting window [60, 91) "our... bar.".
  //   3. trimmed, shifted window [31, 91) "we... bar."
  EXPECT_THAT(
      GetWindows(content, snippet.entries(0)),
      ElementsAre(
          "Concerning the subject of foo, we need to begin considering our",
          "we need to begin considering our options regarding body bar."));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)),
              ElementsAre("foo", "bar"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("foo", "bar"));
}
844
// Verifies that a query term restricted to one section is only matched in that
// section, while an unrestricted term is matched in every requested section.
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();
  // String:     "Concerning the subject of foo, we need to begin considering "
  //              ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
  // UTF-8 idx:   0          11  15      23 26   31 34   39 42    48
  // UTF-32 idx:  0          11  15      23 26   31 34   39 42    48
  //
  // String ctd: "our options regarding body bar."
  //              ^   ^       ^         ^    ^   ^
  // UTF-8 idx:   60  64      72        82   87  91
  // UTF-32 idx:  60  64      72        82   87  91
  SectionIdMask section_mask = 0b00000011;
  // "subject" should match in both sections, but "foo" is restricted to "body"
  // so it should only match in the 'body' section and not the 'subject'
  // section.
  SectionRestrictQueryTermsMap query_terms{{"", {"subject"}},
                                           {"body", {"foo"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Check the snippets. The size check is fatal because entries(0) and
  // entries(1) are indexed below; continuing after a size mismatch would read
  // out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(2));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  // The first window will be:
  //   1. untrimmed, no-shifting window will be (-15,50).
  //   2. trimmed, no-shifting window [0, 47) "Concerning... begin".
  //   3. trimmed, shifted window [0, 63) "Concerning... our"
  // The second window will be:
  //   1. untrimmed, no-shifting window will be (-6,59).
  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
  //   3. trimmed, shifted window [0, 63) "Concerning... our"
  EXPECT_THAT(
      GetWindows(content, snippet.entries(0)),
      ElementsAre(
          "Concerning the subject of foo, we need to begin considering our",
          "Concerning the subject of foo, we need to begin considering our"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)),
              ElementsAre("subject", "foo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("subject", "foo"));

  // Only the unrestricted term "subject" may be reported for 'subject'.
  EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
  content = GetString(&document, snippet.entries(1).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(1)),
              ElementsAre("subject foo"));
  EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("subject"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
              ElementsAre("subject"));
}
904
// Verifies that num_matches_per_property caps the matches reported for each
// property: with a limit of 1, only "foo" is reported for 'body' even though
// "bar" also occurs there.
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  // String:     "Concerning the subject of foo, we need to begin considering "
  //              ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
  // UTF-8 idx:   0          11  15      23 26   31 34   39 42    48
  // UTF-32 idx:  0          11  15      23 26   31 34   39 42    48
  //
  // String ctd: "our options regarding body bar."
  //              ^   ^       ^         ^    ^   ^
  // UTF-8 idx:   60  64      72        82   87  91
  // UTF-32 idx:  60  64      72        82   87  91
  snippet_spec_.set_num_matches_per_property(1);

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Check the snippets. The size check is fatal because entries(0) and
  // entries(1) are indexed below; continuing after a size mismatch would read
  // out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(2));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-6,59).
  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
  //   3. trimmed, shifted window [0, 63) "Concerning... our"
  EXPECT_THAT(
      GetWindows(content, snippet.entries(0)),
      ElementsAre(
          "Concerning the subject of foo, we need to begin considering our"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));

  EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
  content = GetString(&document, snippet.entries(1).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(1)),
              ElementsAre("subject foo"));
  EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
}
955
// Verifies prefix matching against differently-cased content: the lowercase
// query "md" matches the token "MDI", and the reported submatch is "MD".
TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "MDI team")
          .AddStringProperty("body", "Some members are in Zürich.")
          .Build();
  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"md"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Fatal size check: entries(0) is indexed below, so an empty result must
  // stop the test instead of reading out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("subject"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("MDI team"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("MDI"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("MD"));
}
977
// Verifies exact matching against accented, differently-cased content: the
// query "zurich" matches the token "Zürich", and the match and submatch are
// reported using the original (un-normalized) text.
TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "MDI team")
          .AddStringProperty("body", "Some members are in Zürich.")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"zurich"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Fatal size check: entries(0) is indexed below, so an empty result must
  // stop the test instead of reading out of range.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("Some members are in Zürich."));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("Zürich"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("Zürich"));
}
1003
// Verifies the property paths reported for a flat document whose string
// properties are all repeated: each matching value becomes its own entry,
// named with the value's index (e.g. "X[1]").
TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) {
  SchemaProto schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("SingleLevelType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("X")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Y")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Z")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  // Install the test-specific schema and rebuild the retriever on top of it.
  ICING_ASSERT_OK(schema_store_->SetSchema(
      schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  // Every property holds the same four values; "polo" sits at indices 1 and 3.
  std::vector<std::string> string_values = {"marco", "polo", "marco", "polo"};
  DocumentProto document;
  document.set_schema("SingleLevelType");
  const char* const property_names[] = {"X", "Y", "Z"};
  for (const char* property_name : property_names) {
    PropertyProto* prop = document.add_properties();
    prop->set_name(property_name);
    for (const std::string& value : string_values) {
      prop->add_string_values(value);
    }
  }

  SectionIdMask section_mask = 0b00000111;
  SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Three properties with two matching values each. The size check is fatal
  // because entries(0) and entries(1) are indexed below.
  ASSERT_THAT(snippet.entries(), SizeIs(6));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("X[1]"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));

  EXPECT_THAT(snippet.entries(1).property_name(), Eq("X[3]"));
  content = GetString(&document, snippet.entries(1).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));

  EXPECT_THAT(GetPropertyPaths(snippet),
              ElementsAre("X[1]", "X[3]", "Y[1]", "Y[3]", "Z[1]", "Z[3]"));
}
1074
// Verifies the property paths reported for nested documents: the parent
// properties A/B/C are optional (single-valued) documents and the nested
// string properties X/Y/Z are repeated, so the expected paths look like
// "A.X[1]".
TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) {
  SchemaProto schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("SingleLevelType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("X")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Y")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Z")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("MultiLevelType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("A")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_OPTIONAL))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("B")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_OPTIONAL))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("C")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_OPTIONAL)))
          .Build();
  // Install the test-specific schema and rebuild the retriever on top of it.
  ICING_ASSERT_OK(schema_store_->SetSchema(
      schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  // Every nested property holds the same four values; "polo" sits at indices 1
  // and 3.
  std::vector<std::string> string_values = {"marco", "polo", "marco", "polo"};
  DocumentProto subdocument;
  const char* const nested_property_names[] = {"X", "Y", "Z"};
  for (const char* property_name : nested_property_names) {
    PropertyProto* nested_prop = subdocument.add_properties();
    nested_prop->set_name(property_name);
    for (const std::string& value : string_values) {
      nested_prop->add_string_values(value);
    }
  }

  // Each parent property holds one copy of the subdocument.
  DocumentProto document;
  document.set_schema("MultiLevelType");
  const char* const parent_property_names[] = {"A", "B", "C"};
  for (const char* property_name : parent_property_names) {
    PropertyProto* parent_prop = document.add_properties();
    parent_prop->set_name(property_name);
    *parent_prop->add_document_values() = subdocument;
  }

  SectionIdMask section_mask = 0b111111111;
  SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // 3 parents x 3 nested properties x 2 matching values. The size check is
  // fatal because entries(0) and entries(1) are indexed below.
  ASSERT_THAT(snippet.entries(), SizeIs(18));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("A.X[1]"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));

  EXPECT_THAT(snippet.entries(1).property_name(), Eq("A.X[3]"));
  content = GetString(&document, snippet.entries(1).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));

  EXPECT_THAT(
      GetPropertyPaths(snippet),
      ElementsAre("A.X[1]", "A.X[3]", "A.Y[1]", "A.Y[3]", "A.Z[1]", "A.Z[3]",
                  "B.X[1]", "B.X[3]", "B.Y[1]", "B.Y[3]", "B.Z[1]", "B.Z[3]",
                  "C.X[1]", "C.X[3]", "C.Y[1]", "C.Y[3]", "C.Z[1]", "C.Z[3]"));
}
1181
// Verifies the property paths reported when both the parent document
// properties (A/B/C) and the nested string properties (X/Y/Z) are repeated, so
// the expected paths carry an index at both levels, e.g. "A[0].X[1]".
TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) {
  SchemaProto schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("SingleLevelType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("X")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Y")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Z")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("MultiLevelType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("A")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("B")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("C")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  // Install the test-specific schema and rebuild the retriever on top of it.
  ICING_ASSERT_OK(schema_store_->SetSchema(
      schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  // Every nested property holds the same four values; "polo" sits at indices 1
  // and 3.
  std::vector<std::string> string_values = {"marco", "polo", "marco", "polo"};
  DocumentProto subdocument;
  const char* const nested_property_names[] = {"X", "Y", "Z"};
  for (const char* property_name : nested_property_names) {
    PropertyProto* nested_prop = subdocument.add_properties();
    nested_prop->set_name(property_name);
    for (const std::string& value : string_values) {
      nested_prop->add_string_values(value);
    }
  }

  // Each parent property holds two copies of the subdocument.
  DocumentProto document;
  document.set_schema("MultiLevelType");
  const char* const parent_property_names[] = {"A", "B", "C"};
  for (const char* property_name : parent_property_names) {
    PropertyProto* parent_prop = document.add_properties();
    parent_prop->set_name(property_name);
    *parent_prop->add_document_values() = subdocument;
    *parent_prop->add_document_values() = subdocument;
  }

  SectionIdMask section_mask = 0b111111111;
  SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // 3 parents x 2 subdocuments x 3 nested properties x 2 matching values. The
  // size check is fatal because entries(0) and entries(1) are indexed below.
  ASSERT_THAT(snippet.entries(), SizeIs(36));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("A[0].X[1]"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));

  EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[0].X[3]"));
  content = GetString(&document, snippet.entries(1).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));

  EXPECT_THAT(GetPropertyPaths(snippet),
              ElementsAre("A[0].X[1]", "A[0].X[3]", "A[1].X[1]", "A[1].X[3]",
                          "A[0].Y[1]", "A[0].Y[3]", "A[1].Y[1]", "A[1].Y[3]",
                          "A[0].Z[1]", "A[0].Z[3]", "A[1].Z[1]", "A[1].Z[3]",
                          "B[0].X[1]", "B[0].X[3]", "B[1].X[1]", "B[1].X[3]",
                          "B[0].Y[1]", "B[0].Y[3]", "B[1].Y[1]", "B[1].Y[3]",
                          "B[0].Z[1]", "B[0].Z[3]", "B[1].Z[1]", "B[1].Z[3]",
                          "C[0].X[1]", "C[0].X[3]", "C[1].X[1]", "C[1].X[3]",
                          "C[0].Y[1]", "C[0].Y[3]", "C[1].Y[1]", "C[1].Y[3]",
                          "C[0].Z[1]", "C[0].Z[3]", "C[1].Z[1]", "C[1].Z[3]"));
}
1296
// Verifies the property paths reported when the parent document properties
// (A/B/C) are repeated but the nested string properties (X/Y/Z) are optional:
// the expected paths index only the parent level, e.g. "A[0].X".
TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) {
  SchemaProto schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("SingleLevelType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("X")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_OPTIONAL))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Y")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_OPTIONAL))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("Z")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_PLAIN)
                                        .SetCardinality(CARDINALITY_OPTIONAL)))
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("MultiLevelType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("A")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("B")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_REPEATED))
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("C")
                                        .SetDataTypeDocument(
                                            "SingleLevelType",
                                            /*index_nested_properties=*/true)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  // Install the test-specific schema and rebuild the retriever on top of it.
  ICING_ASSERT_OK(schema_store_->SetSchema(
      schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  // X and Z hold the query term; Y holds "marco" and must not be snippeted.
  DocumentProto subdocument;
  PropertyProto* prop = subdocument.add_properties();
  prop->set_name("X");
  prop->add_string_values("polo");
  prop = subdocument.add_properties();
  prop->set_name("Y");
  prop->add_string_values("marco");
  prop = subdocument.add_properties();
  prop->set_name("Z");
  prop->add_string_values("polo");

  // Each parent property holds two copies of the subdocument.
  DocumentProto document;
  document.set_schema("MultiLevelType");
  const char* const parent_property_names[] = {"A", "B", "C"};
  for (const char* property_name : parent_property_names) {
    PropertyProto* parent_prop = document.add_properties();
    parent_prop->set_name(property_name);
    *parent_prop->add_document_values() = subdocument;
    *parent_prop->add_document_values() = subdocument;
  }

  SectionIdMask section_mask = 0b111111111;
  SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // 3 parents x 2 subdocuments x 2 matching properties (X and Z). The size
  // check is fatal because entries(0) and entries(1) are indexed below.
  ASSERT_THAT(snippet.entries(), SizeIs(12));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("A[0].X"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));

  EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[1].X"));
  content = GetString(&document, snippet.entries(1).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
  EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));

  EXPECT_THAT(
      GetPropertyPaths(snippet),
      ElementsAre("A[0].X", "A[1].X", "A[0].Z", "A[1].Z", "B[0].X", "B[1].X",
                  "B[0].Z", "B[1].Z", "C[0].X", "C[1].X", "C[0].Z", "C[1].Z"));
}
1398
// Checks the match and submatch reported for a prefix query against Chinese
// text, including the UTF-16 position and lengths stored in the match proto.
TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) {
  // String:     "我每天走路去上班。"
  //              ^ ^  ^   ^^
  // UTF8 idx:    0 3  9  15 18
  // UTF16 idx:   0 1  3   5 6
  // Breaks into segments: "我", "每天", "走路", "去", "上班"
  constexpr std::string_view kChinese = "我每天走路去上班。";
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", kChinese)
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
  SectionIdMask section_mask = 0b00000011;

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Exactly one property must have been snippeted, and it must be "subject".
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& subject_entry = snippet.entries(0);
  EXPECT_THAT(subject_entry.property_name(), Eq("subject"));
  std::string_view content =
      GetString(&document, subject_entry.property_name());

  // Exactly one match must have been found within "subject".
  ASSERT_THAT(subject_entry.snippet_matches(), SizeIs(1));
  const SnippetMatchProto& only_match = subject_entry.snippet_matches(0);

  // The whole token is the exact match; the query term is the submatch.
  EXPECT_THAT(GetMatches(content, subject_entry), ElementsAre("走路"));
  EXPECT_THAT(GetSubMatches(content, subject_entry), ElementsAre("走"));

  // The UTF-16 offsets and lengths must agree with the diagram above.
  EXPECT_THAT(only_match.exact_match_utf16_position(), Eq(3));
  EXPECT_THAT(only_match.exact_match_utf16_length(), Eq(2));
  EXPECT_THAT(only_match.submatch_utf16_length(), Eq(1));
}
1442
// Checks the snippet window computed around a match in Chinese text when the
// maximum window length is six UTF-32 characters, including the UTF-16
// position and length stored in the match proto.
TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
  // Swap in a Simplified Chinese segmenter and rebuild the retriever on it.
  language_segmenter_factory::SegmenterOptions segmenter_options(
      ULOC_SIMPLIFIED_CHINESE, jni_cache_.get());
  ICING_ASSERT_OK_AND_ASSIGN(
      language_segmenter_,
      language_segmenter_factory::Create(std::move(segmenter_options)));
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  // String:     "我每天走路去上班。"
  //              ^ ^  ^   ^^
  // UTF8 idx:    0 3  9  15 18
  // UTF16 idx:   0 1  3   5 6
  // UTF32 idx:   0 1  3   5 6
  // Breaks into segments: "我", "每天", "走路", "去", "上班"
  constexpr std::string_view kChinese = "我每天走路去上班。";
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", kChinese)
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
  SectionIdMask section_mask = 0b00000011;

  // The window will be:
  //   1. untrimmed, no-shifting window will be (0,7).
  //   2. trimmed, no-shifting window [1, 6) "每天走路去".
  //   3. trimmed, shifted window [0, 6) "我每天走路去"
  snippet_spec_.set_max_window_utf32_length(6);

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Exactly one property must have been snippeted, and it must be "subject".
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& subject_entry = snippet.entries(0);
  EXPECT_THAT(subject_entry.property_name(), Eq("subject"));
  std::string_view content =
      GetString(&document, subject_entry.property_name());

  // Exactly one match must have been found within "subject".
  ASSERT_THAT(subject_entry.snippet_matches(), SizeIs(1));
  const SnippetMatchProto& only_match = subject_entry.snippet_matches(0);

  // The window must be the shifted, trimmed window described above.
  EXPECT_THAT(GetWindows(content, subject_entry), ElementsAre("我每天走路去"));

  // The UTF-16 window offset and length must agree with that window.
  EXPECT_THAT(only_match.window_utf16_position(), Eq(0));
  EXPECT_THAT(only_match.window_utf16_length(), Eq(6));
}
1501
// Checks the match and submatch reported for text made of characters that take
// four bytes in UTF-8 and two code units in UTF-16, including the UTF-16
// position and lengths stored in the match proto.
TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) {
  // The following string has four-byte UTF-8 characters. Most importantly,
  // each of them is also two code units in UTF-16.
  //   Text:       U+10000 U+10001 ' ' U+10002 U+10003 ' ' U+10004
  //   UTF8 idx:   0       4       8   9       13      17  18
  //   UTF16 idx:  0       2       4   5       7       9   10
  // Breaks into segments: {U+10000 U+10001}, {U+10002 U+10003}, {U+10004}
  //
  // The characters are spelled as universal character names so that they
  // survive tools that drop characters outside the Basic Multilingual Plane.
  // NOTE(review): the exact code points were reconstructed from the index
  // annotations and assertions in this test - confirm against upstream.
  constexpr std::string_view kText =
      "\U00010000\U00010001 \U00010002\U00010003 \U00010004";
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", kText)
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  // Query with the first character of the second segment.
  SectionRestrictQueryTermsMap query_terms{{"", {"\U00010002"}}};

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Ensure that one and only one property was matched and it was "subject"
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto* entry = &snippet.entries(0);
  EXPECT_THAT(entry->property_name(), Eq("subject"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());

  // Ensure that there is one and only one match within "subject"
  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
  const SnippetMatchProto& match_proto = entry->snippet_matches(0);

  // Ensure that the match is correct: the whole second segment is the exact
  // match and the query term is the submatch.
  EXPECT_THAT(GetMatches(content, *entry),
              ElementsAre("\U00010002\U00010003"));
  EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("\U00010002"));

  // Ensure that the utf-16 values are also as expected
  EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(5));
  EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(4));
  EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
}
1547
// Verifies the window UTF-16 offsets reported when the snippet window covers
// characters that each occupy two UTF-16 code units.
TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
  // The following string has four-byte UTF-8 characters. Most importantly,
  // each of them is also two code units in UTF-16. The characters are spelled
  // as universal character names so the literals survive tooling that cannot
  // carry code points outside the Basic Multilingual Plane.
  // NOTE(review): U+10000..U+10004 were reconstructed from the offsets below;
  // confirm they are the intended fixture characters.
  //
  // Layout (A=U+10000, B=U+10001, C=U+10002, D=U+10003, E=U+10004):
  //   String:     "AB CD E"
  //                ^  ^  ^
  //   UTF8 idx:    0  9  18
  //   UTF16 idx:   0  5  10
  //   UTF32 idx:   0  3  6
  // Breaks into segments: "AB", "CD", "E"
  constexpr std::string_view kText =
      "\U00010000\U00010001 \U00010002\U00010003 \U00010004";
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", kText)
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  // Prefix query for the first character of the second segment.
  SectionRestrictQueryTermsMap query_terms{{"", {"\U00010002"}}};

  // Set a six character window. This will produce a window like this:
  //   String:     "AB CD E"
  //                   ^   ^
  //   UTF8 idx:       9   22
  //   UTF16 idx:      5   12
  //   UTF32 idx:      3   7
  snippet_spec_.set_max_window_utf32_length(6);

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Ensure that one and only one property was matched and it was "subject"
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto* entry = &snippet.entries(0);
  EXPECT_THAT(entry->property_name(), Eq("subject"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());

  // Ensure that there is one and only one match within "subject"
  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
  const SnippetMatchProto& match_proto = entry->snippet_matches(0);

  // Ensure that the window is correct: it spans the second and third segments.
  EXPECT_THAT(GetWindows(content, *entry),
              ElementsAre("\U00010002\U00010003 \U00010004"));

  // Ensure that the utf-16 values are also as expected
  EXPECT_THAT(match_proto.window_utf16_position(), Eq(5));
  EXPECT_THAT(match_proto.window_utf16_length(), Eq(7));
}
1600
// Snippets an ASCII value stored in a property that uses the verbatim
// tokenizer and checks the reported window and submatch offsets.
TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) {
  // Replace the fixture schema with a single type whose only property is
  // tokenized verbatim, then rebuild the retriever against it.
  SchemaProto verbatim_schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("verbatimType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("verbatim")
                                        .SetDataTypeString(TERM_MATCH_EXACT,
                                                           TOKENIZER_VERBATIM)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      verbatim_schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  // The window limit equals the 13 characters of the stored value.
  snippet_spec_.set_max_window_utf32_length(13);

  DocumentProto doc = DocumentBuilder()
                          .SetKey("icing", "verbatim/1")
                          .SetSchema("verbatimType")
                          .AddStringProperty("verbatim", "Hello, world!")
                          .Build();

  // Exact-match query for the full value, restricted to the first section.
  SectionRestrictQueryTermsMap terms{{"", {"Hello, world!"}}};
  SectionIdMask first_section_only = 0b00000001;

  SnippetProto result = snippet_retriever_->RetrieveSnippet(
      terms, TERM_MATCH_EXACT, snippet_spec_, doc, first_section_only);

  // Exactly one entry with exactly one match is expected, on "verbatim".
  ASSERT_THAT(result.entries(), SizeIs(1));
  const SnippetProto::EntryProto& only_entry = result.entries(0);
  ASSERT_THAT(only_entry.snippet_matches(), SizeIs(1));
  ASSERT_THAT(only_entry.property_name(), Eq("verbatim"));

  const SnippetMatchProto& only_match = only_entry.snippet_matches(0);

  // The window starts at byte 0 and is 13 UTF-16 code units long, i.e. it
  // covers the whole verbatim token.
  EXPECT_THAT(only_match.window_byte_position(), Eq(0));
  EXPECT_THAT(only_match.window_utf16_length(), Eq(13));

  // The exact match starts at UTF-16 position 0 and the submatch is as long as
  // the query term "Hello, world!" (13 code units), which is the same as the
  // window length.
  EXPECT_THAT(only_match.exact_match_utf16_position(), Eq(0));
  EXPECT_THAT(only_match.submatch_utf16_length(), Eq(13));
}
1654
// Snippets a Chinese value stored in a property that uses the verbatim
// tokenizer, using a prefix query, and checks window and submatch offsets.
TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) {
  // Replace the fixture schema with a single type whose only property is
  // tokenized verbatim, then rebuild the retriever against it.
  SchemaProto verbatim_schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("verbatimType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("verbatim")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_VERBATIM)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      verbatim_schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  // The window limit equals the nine characters of the stored value.
  snippet_spec_.set_max_window_utf32_length(9);

  // String:     "我每天走路去上班。"
  //              ^ ^  ^   ^^
  // UTF8 idx:    0 3  9  15 18
  // UTF16 idx:   0 1  3   5 6
  // UTF32 idx:   0 1  3   5 6
  // Breaks into segments: "我", "每天", "走路", "去", "上班"
  std::string stored_text = "我每天走路去上班。";
  DocumentProto doc = DocumentBuilder()
                          .SetKey("icing", "verbatim/1")
                          .SetSchema("verbatimType")
                          .AddStringProperty("verbatim", stored_text)
                          .Build();

  // Prefix query for the first two characters, restricted to the first
  // section.
  SectionRestrictQueryTermsMap terms{{"", {"我每"}}};
  SectionIdMask first_section_only = 0b00000001;

  SnippetProto result = snippet_retriever_->RetrieveSnippet(
      terms, TERM_MATCH_PREFIX, snippet_spec_, doc, first_section_only);

  // Exactly one entry with exactly one match is expected, on "verbatim".
  ASSERT_THAT(result.entries(), SizeIs(1));
  const SnippetProto::EntryProto& only_entry = result.entries(0);
  ASSERT_THAT(only_entry.snippet_matches(), SizeIs(1));
  ASSERT_THAT(only_entry.property_name(), Eq("verbatim"));

  const SnippetMatchProto& only_match = only_entry.snippet_matches(0);

  // The window starts at byte 0 and is 9 UTF-16 code units long, i.e. it
  // covers the whole verbatim token.
  EXPECT_THAT(only_match.window_byte_position(), Eq(0));
  EXPECT_THAT(only_match.window_utf16_length(), Eq(9));

  // The exact match starts at UTF-16 position 0 and the submatch is as long as
  // the query term "我每", which is 2 UTF-16 code units.
  EXPECT_THAT(only_match.exact_match_utf16_position(), Eq(0));
  EXPECT_THAT(only_match.submatch_utf16_length(), Eq(2));
}
1713
// Snippets an ASCII address list stored in a property that uses the RFC822
// tokenizer and checks the windows, matches and submatches for two prefix
// queries.
TEST_F(SnippetRetrieverTest, SnippettingRfc822Ascii) {
  // Replace the fixture schema with a single type whose only property uses the
  // RFC822 tokenizer, then rebuild the retriever against it.
  SchemaProto rfc822_schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("rfc822Type")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("rfc822")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_RFC822)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      rfc822_schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  DocumentProto doc =
      DocumentBuilder()
          .SetKey("icing", "rfc822/1")
          .SetSchema("rfc822Type")
          .AddStringProperty("rfc822",
                             "Alexander Sav <tom.bar@google.com>, Very Long "
                             "Name Example <tjbarron@google.com>")
          .Build();

  SectionIdMask first_section_only = 0b00000001;

  {
    // "alexand" should match both the first name token as well as the entire
    // RFC822 token.
    SectionRestrictQueryTermsMap terms{{"", {"alexand"}}};
    snippet_spec_.set_max_window_utf32_length(35);

    SnippetProto result = snippet_retriever_->RetrieveSnippet(
        terms, TERM_MATCH_PREFIX, snippet_spec_, doc, first_section_only);

    ASSERT_THAT(result.entries(), SizeIs(1));
    const SnippetProto::EntryProto& only_entry = result.entries(0);
    EXPECT_THAT(only_entry.property_name(), Eq("rfc822"));

    std::string_view text = GetString(&doc, only_entry.property_name());

    EXPECT_THAT(GetWindows(text, only_entry),
                ElementsAre("Alexander Sav <tom.bar@google.com>,",
                            "Alexander Sav <tom.bar@google.com>,"));
    EXPECT_THAT(
        GetMatches(text, only_entry),
        ElementsAre("Alexander Sav <tom.bar@google.com>", "Alexander"));
    EXPECT_THAT(GetSubMatches(text, only_entry),
                ElementsAre("Alexand", "Alexand"));
  }

  {
    // "tom" should match the local component, local address, and address
    // tokens.
    SectionRestrictQueryTermsMap terms{{"", {"tom"}}};
    snippet_spec_.set_max_window_utf32_length(36);

    SnippetProto result = snippet_retriever_->RetrieveSnippet(
        terms, TERM_MATCH_PREFIX, snippet_spec_, doc, first_section_only);

    ASSERT_THAT(result.entries(), SizeIs(1));
    const SnippetProto::EntryProto& only_entry = result.entries(0);
    EXPECT_THAT(only_entry.property_name(), Eq("rfc822"));

    std::string_view text = GetString(&doc, only_entry.property_name());

    // TODO(b/248362902) Stop returning duplicate matches.
    EXPECT_THAT(GetWindows(text, only_entry),
                ElementsAre("Alexander Sav <tom.bar@google.com>,",
                            "Alexander Sav <tom.bar@google.com>,",
                            "Alexander Sav <tom.bar@google.com>,"));
    EXPECT_THAT(GetMatches(text, only_entry),
                ElementsAre("tom.bar", "tom.bar@google.com", "tom"));
    EXPECT_THAT(GetSubMatches(text, only_entry),
                ElementsAre("tom", "tom", "tom"));
  }
}
1789
// Snippets a Chinese address-like value stored in a property that uses the
// RFC822 tokenizer and checks the windows, matches and submatches for a
// single-character prefix query.
TEST_F(SnippetRetrieverTest, SnippettingRfc822CJK) {
  // Replace the fixture schema with a single type whose only property uses the
  // RFC822 tokenizer, then rebuild the retriever against it.
  SchemaProto rfc822_schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("rfc822Type")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("rfc822")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_RFC822)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      rfc822_schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  std::string stored_text = "我, 每天@走路, 去@上班";
  DocumentProto doc = DocumentBuilder()
                          .SetKey("icing", "rfc822/1")
                          .SetSchema("rfc822Type")
                          .AddStringProperty("rfc822", stored_text)
                          .Build();

  // Prefix query for "走" with an eight-character window, restricted to the
  // first section.
  snippet_spec_.set_max_window_utf32_length(8);
  SectionRestrictQueryTermsMap terms{{"", {"走"}}};
  SectionIdMask first_section_only = 0b00000001;

  SnippetProto result = snippet_retriever_->RetrieveSnippet(
      terms, TERM_MATCH_PREFIX, snippet_spec_, doc, first_section_only);

  // Exactly one snippet entry is expected, for the "rfc822" property.
  ASSERT_THAT(result.entries(), SizeIs(1));
  const SnippetProto::EntryProto& only_entry = result.entries(0);
  EXPECT_THAT(only_entry.property_name(), Eq("rfc822"));

  std::string_view text = GetString(&doc, only_entry.property_name());

  // Two identical matches are currently expected for the same text.
  // NOTE(review): the previous comment said the local component, address,
  // local address and token all match and that some windows are "" because the
  // snippet window is too small; only two entries are asserted here, so
  // confirm which tokens these correspond to.
  // TODO(b/248362902) Stop returning duplicate matches.
  EXPECT_THAT(GetWindows(text, only_entry),
              ElementsAre("每天@走路,", "每天@走路,"));
  EXPECT_THAT(GetMatches(text, only_entry), ElementsAre("走路", "走路"));
  EXPECT_THAT(GetSubMatches(text, only_entry), ElementsAre("走", "走"));
}
1843
1844 #ifdef ENABLE_URL_TOKENIZER
// Snippets an ASCII URL stored in a property that uses the URL tokenizer and
// checks windows, matches and submatches for a series of prefix queries.
// Only compiled when ENABLE_URL_TOKENIZER is defined.
TEST_F(SnippetRetrieverTest, SnippettingUrlAscii) {
  // Replace the fixture schema with a single type whose only property uses the
  // URL tokenizer, then rebuild the retriever against it. The term-match
  // constant and the SetSchema arguments follow the other tests in this file.
  SchemaProto schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder().SetType("urlType").AddProperty(
              PropertyConfigBuilder()
                  .SetName("url")
                  .SetDataTypeString(TERM_MATCH_PREFIX, TOKENIZER_URL)
                  .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));

  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "url/1")
          .SetSchema("urlType")
          .AddStringProperty("url", "https://mail.google.com/calendar/google/")
          .Build();

  SectionIdMask section_mask = 0b00000001;

  // Query with single url split-token match
  SectionRestrictQueryTermsMap query_terms{{"", {"com"}}};
  // 40 is the length of the url.
  // Window that is the size of the url should return entire url.
  snippet_spec_.set_max_window_utf32_length(40);

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), "url");

  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());

  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("https://mail.google.com/calendar/google/"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("com"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("com"));

  // Query with single url suffix-token match
  query_terms = SectionRestrictQueryTermsMap{{"", {"mail.goo"}}};
  snippet_spec_.set_max_window_utf32_length(40);

  snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), "url");

  content = GetString(&document, snippet.entries(0).property_name());

  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("https://mail.google.com/calendar/google/"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)),
              ElementsAre("mail.google.com/calendar/google/"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("mail.goo"));

  // Query with multiple url split-token matches
  query_terms = SectionRestrictQueryTermsMap{{"", {"goog"}}};
  snippet_spec_.set_max_window_utf32_length(40);

  snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), "url");

  content = GetString(&document, snippet.entries(0).property_name());

  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("https://mail.google.com/calendar/google/",
                          "https://mail.google.com/calendar/google/"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)),
              ElementsAre("google", "google"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("goog", "goog"));

  // Query with both url split-token and suffix-token matches
  query_terms = SectionRestrictQueryTermsMap{{"", {"mail"}}};
  snippet_spec_.set_max_window_utf32_length(40);

  snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), "url");

  content = GetString(&document, snippet.entries(0).property_name());

  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("https://mail.google.com/calendar/google/",
                          "https://mail.google.com/calendar/google/"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)),
              ElementsAre("mail", "mail.google.com/calendar/google/"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("mail", "mail"));

  // Prefix query with both url split-token and suffix-token matches
  query_terms = SectionRestrictQueryTermsMap{{"", {"http"}}};
  snippet_spec_.set_max_window_utf32_length(40);

  snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), "url");

  content = GetString(&document, snippet.entries(0).property_name());

  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("https://mail.google.com/calendar/google/",
                          "https://mail.google.com/calendar/google/"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)),
              ElementsAre("https", "https://mail.google.com/calendar/google/"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("http", "http"));

  // Window that's smaller than the input size should not return any matches.
  query_terms = SectionRestrictQueryTermsMap{{"", {"google"}}};
  snippet_spec_.set_max_window_utf32_length(10);

  snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  ASSERT_THAT(snippet.entries(), SizeIs(0));

  // Test case with more than two matches
  document =
      DocumentBuilder()
          .SetKey("icing", "url/1")
          .SetSchema("urlType")
          .AddStringProperty("url", "https://www.google.com/calendar/google/")
          .Build();

  // Prefix query with both url split-token and suffix-token matches. The
  // window is 39 characters, the length of this second url.
  query_terms = SectionRestrictQueryTermsMap{{"", {"google"}}};
  snippet_spec_.set_max_window_utf32_length(39);

  snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), "url");

  content = GetString(&document, snippet.entries(0).property_name());

  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("https://www.google.com/calendar/google/",
                          "https://www.google.com/calendar/google/",
                          "https://www.google.com/calendar/google/"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)),
              ElementsAre("google", "google", "google.com/calendar/google/"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("google", "google", "google"));
}
2008 #endif // ENABLE_URL_TOKENIZER
2009
2010 } // namespace
2011
2012 } // namespace lib
2013 } // namespace icing
2014