• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/result/snippet-retriever.h"
16 
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 
21 #include "gmock/gmock.h"
22 #include "gtest/gtest.h"
23 #include "icing/document-builder.h"
24 #include "icing/file/mock-filesystem.h"
25 #include "icing/portable/equals-proto.h"
26 #include "icing/portable/platform.h"
27 #include "icing/proto/document.pb.h"
28 #include "icing/proto/schema.pb.h"
29 #include "icing/proto/search.pb.h"
30 #include "icing/proto/term.pb.h"
31 #include "icing/query/query-terms.h"
32 #include "icing/schema-builder.h"
33 #include "icing/schema/schema-store.h"
34 #include "icing/schema/section-manager.h"
35 #include "icing/store/document-id.h"
36 #include "icing/store/key-mapper.h"
37 #include "icing/testing/common-matchers.h"
38 #include "icing/testing/fake-clock.h"
39 #include "icing/testing/icu-data-file-helper.h"
40 #include "icing/testing/jni-test-helpers.h"
41 #include "icing/testing/test-data.h"
42 #include "icing/testing/tmp-directory.h"
43 #include "icing/tokenization/language-segmenter-factory.h"
44 #include "icing/tokenization/language-segmenter.h"
45 #include "icing/transform/map/map-normalizer.h"
46 #include "icing/transform/normalizer-factory.h"
47 #include "icing/transform/normalizer.h"
48 #include "icing/util/snippet-helpers.h"
49 #include "unicode/uloc.h"
50 
51 namespace icing {
52 namespace lib {
53 
54 namespace {
55 
56 using ::testing::ElementsAre;
57 using ::testing::Eq;
58 using ::testing::IsEmpty;
59 using ::testing::SizeIs;
60 
// TODO (b/246964044): remove ifdef guard when url-tokenizer is ready for export
// to Android. Also move it to schema-builder.h
#ifdef ENABLE_URL_TOKENIZER
// File-local shorthand for the URL tokenizer enum value. It only exists in
// builds that define ENABLE_URL_TOKENIZER.
constexpr auto TOKENIZER_URL = StringIndexingConfig::TokenizerType::URL;
#endif  // ENABLE_URL_TOKENIZER
67 
GetPropertyPaths(const SnippetProto & snippet)68 std::vector<std::string_view> GetPropertyPaths(const SnippetProto& snippet) {
69   std::vector<std::string_view> paths;
70   for (const SnippetProto::EntryProto& entry : snippet.entries()) {
71     paths.push_back(entry.property_name());
72   }
73   return paths;
74 }
75 
76 class SnippetRetrieverTest : public testing::Test {
77  protected:
SetUp()78   void SetUp() override {
79     test_dir_ = GetTestTempDir() + "/icing";
80     filesystem_.CreateDirectoryRecursively(test_dir_.c_str());
81 
82     if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
83       ICING_ASSERT_OK(
84           // File generated via icu_data_file rule in //icing/BUILD.
85           icu_data_file_helper::SetUpICUDataFile(
86               GetTestFilePath("icing/icu.dat")));
87     }
88 
89     jni_cache_ = GetTestJniCache();
90     language_segmenter_factory::SegmenterOptions options(ULOC_US,
91                                                          jni_cache_.get());
92     ICING_ASSERT_OK_AND_ASSIGN(
93         language_segmenter_,
94         language_segmenter_factory::Create(std::move(options)));
95 
96     // Setup the schema
97     ICING_ASSERT_OK_AND_ASSIGN(
98         schema_store_,
99         SchemaStore::Create(&filesystem_, test_dir_, &fake_clock_));
100     SchemaProto schema =
101         SchemaBuilder()
102             .AddType(
103                 SchemaTypeConfigBuilder()
104                     .SetType("email")
105                     .AddProperty(PropertyConfigBuilder()
106                                      .SetName("subject")
107                                      .SetDataTypeString(TERM_MATCH_PREFIX,
108                                                         TOKENIZER_PLAIN)
109                                      .SetCardinality(CARDINALITY_OPTIONAL))
110                     .AddProperty(PropertyConfigBuilder()
111                                      .SetName("body")
112                                      .SetDataTypeString(TERM_MATCH_EXACT,
113                                                         TOKENIZER_PLAIN)
114                                      .SetCardinality(CARDINALITY_OPTIONAL)))
115             .Build();
116     ICING_ASSERT_OK(schema_store_->SetSchema(
117         schema, /*ignore_errors_and_delete_documents=*/false,
118         /*allow_circular_schema_definitions=*/false));
119 
120     ICING_ASSERT_OK_AND_ASSIGN(normalizer_, normalizer_factory::Create(
121                                                 /*max_term_byte_size=*/10000));
122     ICING_ASSERT_OK_AND_ASSIGN(
123         snippet_retriever_,
124         SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
125                                  normalizer_.get()));
126 
127     // Set limits to max - effectively no limit. Enable matching and request a
128     // window of 64 bytes.
129     snippet_spec_.set_num_to_snippet(std::numeric_limits<int32_t>::max());
130     snippet_spec_.set_num_matches_per_property(
131         std::numeric_limits<int32_t>::max());
132     snippet_spec_.set_max_window_utf32_length(64);
133   }
134 
TearDown()135   void TearDown() override {
136     filesystem_.DeleteDirectoryRecursively(test_dir_.c_str());
137   }
138 
139   Filesystem filesystem_;
140   FakeClock fake_clock_;
141   std::unique_ptr<SchemaStore> schema_store_;
142   std::unique_ptr<LanguageSegmenter> language_segmenter_;
143   std::unique_ptr<SnippetRetriever> snippet_retriever_;
144   std::unique_ptr<Normalizer> normalizer_;
145   std::unique_ptr<const JniCache> jni_cache_;
146   ResultSpecProto::SnippetSpecProto snippet_spec_;
147   std::string test_dir_;
148 };
149 
// SnippetRetriever::Create must reject a null pointer for each of its three
// dependencies with FAILED_PRECONDITION.
TEST_F(SnippetRetrieverTest, CreationWithNullPointerShouldFail) {
  constexpr auto kExpectedCode =
      libtextclassifier3::StatusCode::FAILED_PRECONDITION;

  // Null schema store.
  EXPECT_THAT(
      SnippetRetriever::Create(/*schema_store=*/nullptr,
                               language_segmenter_.get(), normalizer_.get()),
      StatusIs(kExpectedCode));

  // Null language segmenter.
  EXPECT_THAT(
      SnippetRetriever::Create(schema_store_.get(),
                               /*language_segmenter=*/nullptr,
                               normalizer_.get()),
      StatusIs(kExpectedCode));

  // Null normalizer.
  EXPECT_THAT(
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               /*normalizer=*/nullptr),
      StatusIs(kExpectedCode));
}
164 
// A max window (4) that is shorter than the matched token "three" (5) cannot
// hold the whole match, so the reported window is expected to be empty.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeSmallerThanMatch) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window starts at the beginning of "three" and ends in the middle of
  // "three". len=4, orig_window= "thre"
  snippet_spec_.set_max_window_utf32_length(4);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre(""));
}
189 
// A max window exactly as long as an odd-length match ("three", 5) should
// yield a window containing just the match.
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowSizeEqualToMatch_OddLengthMatch) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window starts at the beginning of "three" and ends at the exact end of
  // "three". len=5, orig_window= "three"
  snippet_spec_.set_max_window_utf32_length(5);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("three"));
}
215 
// A max window exactly as long as an even-length match ("four", 4) should
// yield a window containing just the match.
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowSizeEqualToMatch_EvenLengthMatch) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"four"}}};

  // Window starts at the beginning of "four" and ends at the exact end of
  // "four". len=4, orig_window= "four"
  snippet_spec_.set_max_window_utf32_length(4);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("four"));
}
241 
// The untrimmed window begins in the whitespace before "two"; the expected
// window is trimmed to token boundaries and then extended to the right.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsInWhitespace) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (2,17).
  //   2. trimmed, no-shifting window [4,13) "two three"
  //   3. trimmed, shifted window [4,18) "two three four"
  snippet_spec_.set_max_window_utf32_length(14);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("two three four"));
}
274 
// The untrimmed window begins in the middle of the token "one"; that partial
// token is dropped and the freed budget is used to extend the window right.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsMidToken) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (1,18).
  //   2. trimmed, no-shifting window [4,18) "two three four"
  //   3. trimmed, shifted window [4,20) "two three four.."
  snippet_spec_.set_max_window_utf32_length(16);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("two three four.."));
}
307 
// The window ends part-way through the run of '.' characters; the expected
// window keeps the punctuation that fits.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInPunctuation) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window ends in the middle of all the punctuation and window starts at 0.
  // len=20, orig_window="one two three four.."
  snippet_spec_.set_max_window_utf32_length(20);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four.."));
}
333 
// The window ends right after a multi-byte punctuation character ('¿'), which
// is expected to be kept in the window.
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowEndsMultiBytePunctuation) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body",
                             "Is everything upside down in Australia¿ Crikey!")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};

  // Window starts in the middle of "upside" and ends right after the
  // multi-byte punctuation. The partial token "pside" is trimmed away.
  // len=24, orig_window="pside down in Australia¿"
  snippet_spec_.set_max_window_utf32_length(24);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("down in Australia¿"));
}
361 
// The window extends one character past a multi-byte punctuation character
// ('¿'); the trailing whitespace is expected to be trimmed from the window.
TEST_F(SnippetRetrieverTest,
       SnippetingWindowMaxWindowBeyondMultiBytePunctuation) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body",
                             "Is everything upside down in Australia¿ Crikey!")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"in"}}};

  // Window starts at the beginning of "upside" and ends in the whitespace
  // that follows the multi-byte punctuation.
  // len=26, orig_window="upside down in Australia¿ "
  snippet_spec_.set_max_window_utf32_length(26);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("upside down in Australia¿"));
}
389 
// The untrimmed window begins before index 0 of the value; the start is
// clamped to the value start and the window is extended to the right.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowStartsBeforeValueStart) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-2,21).
  //   2. trimmed, no-shifting window [0,21) "one two three four..."
  //   3. trimmed, shifted window [0,22) "one two three four...."
  snippet_spec_.set_max_window_utf32_length(22);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four...."));
}
422 
// The window ends in the whitespace between "...." and "five"; the trailing
// whitespace is expected to be trimmed from the window.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsInWhitespace) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Window ends before "five" but after all the punctuation
  // len=26, orig_window="one two three four.... "
  snippet_spec_.set_max_window_utf32_length(26);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four...."));
}
448 
// The untrimmed window ends in the middle of "five"; after trimming, the
// budget left over from the clamped start lets the window take in all of
// "five".
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowEndsMidToken) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // String:      "one two three four.... five"
  //               ^   ^   ^     ^        ^   ^
  // UTF-8 idx:    0   4   8     14       23  27
  // UTF-32 idx:   0   4   8     14       23  27
  //
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-7,26).
  //   2. trimmed, no-shifting window [0,22) "one two three four...."
  //   3. trimmed, shifted window [0,27) "one two three four.... five"
  snippet_spec_.set_max_window_utf32_length(32);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four.... five"));
}
481 
// With a max window of 34 the whole 27-character value is expected back as
// the window.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeEqualToValueSize) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // The value is 27 characters long, so a 34-character max window can hold
  // all of it.
  // NOTE(review): the test name says "equal to value size" although 34 != 27;
  // presumably 34 is the size at which a window centered on "three" ends
  // exactly at the end of the value - confirm against the window algorithm.
  // len=34, orig_window="one two three four.... five"
  snippet_spec_.set_max_window_utf32_length(34);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four.... five"));
}
507 
// A max window larger than the whole value should yield the whole value as
// the window.
TEST_F(SnippetRetrieverTest, SnippetingWindowMaxWindowSizeLargerThanValueSize) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"three"}}};

  // Max window size exceeds the size of the value.
  // len=36, orig_window="one two three four.... five"
  snippet_spec_.set_max_window_utf32_length(36);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four.... five"));
}
533 
// The match is near the start of the text, so the centered window overruns
// the start; the unused budget is expected to extend the window to the right.
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStart) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five six")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"two"}}};

  // String:      "one two three four.... five six"
  //               ^   ^   ^     ^        ^    ^  ^
  // UTF-8 idx:    0   4   8     14       23  28  31
  // UTF-32 idx:   0   4   8     14       23  28  31
  //
  // The untrimmed window extends past the start of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-10,19).
  //   2. trimmed, no-shifting window [0,19) "one two three four."
  //   3. trimmed, shifted window [0,27) "one two three four.... five"
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four.... five"));
}
567 
// The match is near the end of the text, so the centered window overruns the
// end; the unused budget is expected to extend the window to the left.
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEnd) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four.... five six")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"five"}}};

  // String:      "one two three four.... five six"
  //               ^   ^   ^     ^        ^    ^  ^
  // UTF-8 idx:    0   4   8     14       23  28  31
  // UTF-32 idx:   0   4   8     14       23  28  31
  //
  // The untrimmed window extends past the end of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (10,39).
  //   2. trimmed, no-shifting window [14,31) "four.... five six"
  //   3. trimmed, shifted window [4,31) "two three four.... five six"
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("two three four.... five six"));
}
601 
// The match is near the start of a text shorter than the max window; the
// window is expected to grow right until it covers the whole text.
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextStartShortText) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four....")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"two"}}};

  // String:      "one two three four...."
  //               ^   ^   ^     ^       ^
  // UTF-8 idx:    0   4   8     14      22
  // UTF-32 idx:   0   4   8     14      22
  //
  // The untrimmed window extends past the start of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-10,19).
  //   2. trimmed, no-shifting window [0, 19) "one two three four."
  //   3. trimmed, shifted window [0, 22) "one two three four...."
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four...."));
}
635 
// The match is near the end of a text shorter than the max window; the
// window is expected to grow left until it covers the whole text.
TEST_F(SnippetRetrieverTest, SnippetingWindowMatchAtTextEndShortText) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "counting")
          .AddStringProperty("body", "one two three four....")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"four"}}};

  // String:      "one two three four...."
  //               ^   ^   ^     ^       ^
  // UTF-8 idx:    0   4   8     14      22
  // UTF-32 idx:   0   4   8     14      22
  //
  // The untrimmed window extends past the end of the text.
  // The window will be:
  //   1. untrimmed, no-shifting window will be (1,30).
  //   2. trimmed, no-shifting window [4, 22) "two three four...."
  //   3. trimmed, shifted window [0, 22) "one two three four...."
  snippet_spec_.set_max_window_utf32_length(28);
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("one two three four...."));
}
669 
// A prefix query for "f" should snippet only the prefix-enabled "subject"
// property, reporting the full token as the match and "f" as the submatch.
TEST_F(SnippetRetrieverTest, PrefixSnippeting) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body", "Only a fool would match this content.")
          .Build();
  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"f"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Check the snippets. 'f' should match prefix-enabled property 'subject', but
  // not exact-only property 'body'
  // ASSERT (not EXPECT): entries(0) below must not be evaluated if the result
  // does not contain exactly one entry.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("subject"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("subject foo"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("f"));
}
694 
// Exact matching: the single-letter query term "f" is not expected to produce
// any snippet entries for this document.
TEST_F(SnippetRetrieverTest, ExactSnippeting) {
  DocumentBuilder email_builder;
  email_builder.SetKey("icing", "email/1");
  email_builder.SetSchema("email");
  email_builder.AddStringProperty("subject", "subject foo");
  email_builder.AddStringProperty("body",
                                  "Only a fool would match this content.");
  DocumentProto email = email_builder.Build();

  // Unrestricted term "f", searched across the two lowest section ids.
  SectionRestrictQueryTermsMap unrestricted_terms{{"", {"f"}}};
  SectionIdMask both_sections = 0b00000011;

  SnippetProto result = snippet_retriever_->RetrieveSnippet(
      unrestricted_terms, TERM_MATCH_EXACT, snippet_spec_, email,
      both_sections);

  // No entry should have been produced.
  EXPECT_THAT(result.entries(), IsEmpty());
}
712 
// With the maximum window length set to 0, the exact match for "foo" in
// 'subject' is still expected to be reported, but its window is expected to be
// the empty string.
TEST_F(SnippetRetrieverTest, SimpleSnippetingNoWindowing) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body", "Only a fool would match this content.")
          .Build();

  snippet_spec_.set_max_window_utf32_length(0);

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"foo"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Check the snippets.
  // ASSERT rather than EXPECT: entries(0) is read below, so the test must stop
  // here instead of indexing out of range when the entry count is wrong.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("subject"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre(""));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));
}
738 
TEST_F(SnippetRetrieverTest,SnippetingMultipleMatches)739 TEST_F(SnippetRetrieverTest, SnippetingMultipleMatches) {
740   DocumentProto document =
741       DocumentBuilder()
742           .SetKey("icing", "email/1")
743           .SetSchema("email")
744           .AddStringProperty("subject", "subject foo")
745           .AddStringProperty("body",
746                              "Concerning the subject of foo, we need to begin "
747                              "considering our options regarding body bar.")
748           .Build();
749   // String:      "Concerning the subject of foo, we need to begin considering "
750   //               ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
751   // UTF-8 idx:    0          11  15     23  26  31  34  39  42    48
752   // UTF-32 idx:   0          11  15     23  26  31  34  39  42    48
753   //
754   // String ctd:  "our options regarding body bar."
755   //               ^   ^       ^         ^    ^   ^
756   // UTF-8 idx:    60  64      72        82   87  91
757   // UTF-32 idx:   60  64      72        82   87  91
758   SectionIdMask section_mask = 0b00000011;
759   SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
760   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
761       query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
762 
763   // Check the snippets
764   EXPECT_THAT(snippet.entries(), SizeIs(2));
765   EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
766   std::string_view content =
767       GetString(&document, snippet.entries(0).property_name());
768   // The first window will be:
769   //   1. untrimmed, no-shifting window will be (-6,59).
770   //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
771   //   3. trimmed, shifted window [0, 63) "Concerning... our"
772   // The second window will be:
773   //   1. untrimmed, no-shifting window will be (54,91).
774   //   2. trimmed, no-shifting window [60, 91) "our... bar.".
775   //   3. trimmed, shifted window [31, 91) "we... bar."
776   EXPECT_THAT(
777       GetWindows(content, snippet.entries(0)),
778       ElementsAre(
779           "Concerning the subject of foo, we need to begin considering our",
780           "we need to begin considering our options regarding body bar."));
781   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
782               ElementsAre("foo", "bar"));
783   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
784               ElementsAre("foo", "bar"));
785 
786   EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
787   content = GetString(&document, snippet.entries(1).property_name());
788   EXPECT_THAT(GetWindows(content, snippet.entries(1)),
789               ElementsAre("subject foo"));
790   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo"));
791   EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
792 }
793 
// Same document and query terms as the multiple-matches case, but the section
// mask only has its lowest bit set: only the 'body' entry is expected back.
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrict) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();
  // String:      "Concerning the subject of foo, we need to begin considering "
  //               ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
  // UTF-8 idx:    0          11  15     23  26  31  34  39  42    48
  // UTF-32 idx:   0          11  15     23  26  31  34  39  42    48
  //
  // String ctd:  "our options regarding body bar."
  //               ^   ^       ^         ^    ^   ^
  // UTF-8 idx:    60  64      72        82   87  91
  // UTF-32 idx:   60  64      72        82   87  91
  //
  // Section 1 "subject" is not in the section_mask, so no snippet information
  // from that section should be returned by the SnippetRetriever.
  SectionIdMask section_mask = 0b00000001;
  SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Check the snippets.
  // ASSERT rather than EXPECT: entries(0) is read below, so the test must stop
  // here instead of indexing out of range when the entry count is wrong.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  // The first window will be:
  //   1. untrimmed, no-shifting window will be (-6,59).
  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
  //   3. trimmed, shifted window [0, 63) "Concerning... our"
  // The second window will be:
  //   1. untrimmed, no-shifting window will be (54,91).
  //   2. trimmed, no-shifting window [60, 91) "our... bar.".
  //   3. trimmed, shifted window [31, 91) "we... bar."
  EXPECT_THAT(
      GetWindows(content, snippet.entries(0)),
      ElementsAre(
          "Concerning the subject of foo, we need to begin considering our",
          "we need to begin considering our options regarding body bar."));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)),
              ElementsAre("foo", "bar"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("foo", "bar"));
}
844 
TEST_F(SnippetRetrieverTest,SnippetingMultipleMatchesSectionRestrictedTerm)845 TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesSectionRestrictedTerm) {
846   DocumentProto document =
847       DocumentBuilder()
848           .SetKey("icing", "email/1")
849           .SetSchema("email")
850           .AddStringProperty("subject", "subject foo")
851           .AddStringProperty("body",
852                              "Concerning the subject of foo, we need to begin "
853                              "considering our options regarding body bar.")
854           .Build();
855   // String:      "Concerning the subject of foo, we need to begin considering "
856   //               ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
857   // UTF-8 idx:    0          11  15     23  26  31  34  39  42    48
858   // UTF-32 idx:   0          11  15     23  26  31  34  39  42    48
859   //
860   // String ctd:  "our options regarding body bar."
861   //               ^   ^       ^         ^    ^   ^
862   // UTF-8 idx:    60  64      72        82   87  91
863   // UTF-32 idx:   60  64      72        82   87  91
864   SectionIdMask section_mask = 0b00000011;
865   // "subject" should match in both sections, but "foo" is restricted to "body"
866   // so it should only match in the 'body' section and not the 'subject'
867   // section.
868   SectionRestrictQueryTermsMap query_terms{{"", {"subject"}},
869                                            {"body", {"foo"}}};
870   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
871       query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
872 
873   // Check the snippets
874   EXPECT_THAT(snippet.entries(), SizeIs(2));
875   EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
876   std::string_view content =
877       GetString(&document, snippet.entries(0).property_name());
878   // The first window will be:
879   //   1. untrimmed, no-shifting window will be (-15,50).
880   //   2. trimmed, no-shifting window [0, 47) "Concerning... begin".
881   //   3. trimmed, shifted window [0, 63) "Concerning... our"
882   // The second window will be:
883   //   1. untrimmed, no-shifting window will be (-6,59).
884   //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
885   //   3. trimmed, shifted window [0, 63) "Concerning... our"
886   EXPECT_THAT(
887       GetWindows(content, snippet.entries(0)),
888       ElementsAre(
889           "Concerning the subject of foo, we need to begin considering our",
890           "Concerning the subject of foo, we need to begin considering our"));
891   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
892               ElementsAre("subject", "foo"));
893   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
894               ElementsAre("subject", "foo"));
895 
896   EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
897   content = GetString(&document, snippet.entries(1).property_name());
898   EXPECT_THAT(GetWindows(content, snippet.entries(1)),
899               ElementsAre("subject foo"));
900   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("subject"));
901   EXPECT_THAT(GetSubMatches(content, snippet.entries(1)),
902               ElementsAre("subject"));
903 }
904 
// With num_matches_per_property set to 1, each property is expected to report
// a single match ("foo") even though 'body' also contains "bar".
TEST_F(SnippetRetrieverTest, SnippetingMultipleMatchesOneMatchPerProperty) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "subject foo")
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  // String:      "Concerning the subject of foo, we need to begin considering "
  //               ^          ^   ^       ^  ^    ^  ^    ^  ^     ^
  // UTF-8 idx:    0          11  15     23  26  31  34  39  42    48
  // UTF-32 idx:   0          11  15     23  26  31  34  39  42    48
  //
  // String ctd:  "our options regarding body bar."
  //               ^   ^       ^         ^    ^   ^
  // UTF-8 idx:    60  64      72        82   87  91
  // UTF-32 idx:   60  64      72        82   87  91
  snippet_spec_.set_num_matches_per_property(1);

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"foo", "bar"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Check the snippets.
  // ASSERT rather than EXPECT: entries(0) and entries(1) are read below, so
  // the test must stop here instead of indexing out of range when the entry
  // count is wrong.
  ASSERT_THAT(snippet.entries(), SizeIs(2));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  // The window will be:
  //   1. untrimmed, no-shifting window will be (-6,59).
  //   2. trimmed, no-shifting window [0, 59) "Concerning... considering".
  //   3. trimmed, shifted window [0, 63) "Concerning... our"
  EXPECT_THAT(
      GetWindows(content, snippet.entries(0)),
      ElementsAre(
          "Concerning the subject of foo, we need to begin considering our"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("foo"));

  EXPECT_THAT(snippet.entries(1).property_name(), Eq("subject"));
  content = GetString(&document, snippet.entries(1).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(1)),
              ElementsAre("subject foo"));
  EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("foo"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("foo"));
}
955 
// Prefix matching against differently-cased content: the lowercase query "md"
// is expected to match the token "MDI" in 'subject', with the submatch "MD"
// taken from the original (un-normalized) text.
TEST_F(SnippetRetrieverTest, PrefixSnippetingNormalization) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "MDI team")
          .AddStringProperty("body", "Some members are in Zürich.")
          .Build();
  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"md"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // ASSERT rather than EXPECT: entries(0) is read below, so the test must stop
  // here instead of indexing out of range when the entry count is wrong.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("subject"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("MDI team"));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("MDI"));
  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("MD"));
}
977 
// Exact matching against accented content: the unaccented lowercase query
// "zurich" is expected to match the token "Zürich" in 'body', and both the
// match and the submatch are reported using the original text.
TEST_F(SnippetRetrieverTest, ExactSnippetingNormalization) {
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", "MDI team")
          .AddStringProperty("body", "Some members are in Zürich.")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"zurich"}}};
  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // ASSERT rather than EXPECT: entries(0) is read below, so the test must stop
  // here instead of indexing out of range when the entry count is wrong.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), Eq("body"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());
  EXPECT_THAT(GetWindows(content, snippet.entries(0)),
              ElementsAre("Some members are in Zürich."));
  EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("Zürich"));

  EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
              ElementsAre("Zürich"));
}
1003 
TEST_F(SnippetRetrieverTest,SnippetingTestOneLevel)1004 TEST_F(SnippetRetrieverTest, SnippetingTestOneLevel) {
1005   SchemaProto schema =
1006       SchemaBuilder()
1007           .AddType(SchemaTypeConfigBuilder()
1008                        .SetType("SingleLevelType")
1009                        .AddProperty(PropertyConfigBuilder()
1010                                         .SetName("X")
1011                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1012                                                            TOKENIZER_PLAIN)
1013                                         .SetCardinality(CARDINALITY_REPEATED))
1014                        .AddProperty(PropertyConfigBuilder()
1015                                         .SetName("Y")
1016                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1017                                                            TOKENIZER_PLAIN)
1018                                         .SetCardinality(CARDINALITY_REPEATED))
1019                        .AddProperty(PropertyConfigBuilder()
1020                                         .SetName("Z")
1021                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1022                                                            TOKENIZER_PLAIN)
1023                                         .SetCardinality(CARDINALITY_REPEATED)))
1024           .Build();
1025   ICING_ASSERT_OK(schema_store_->SetSchema(
1026       schema, /*ignore_errors_and_delete_documents=*/true,
1027       /*allow_circular_schema_definitions=*/false));
1028   ICING_ASSERT_OK_AND_ASSIGN(
1029       snippet_retriever_,
1030       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1031                                normalizer_.get()));
1032 
1033   std::vector<std::string> string_values = {"marco", "polo", "marco", "polo"};
1034   DocumentProto document;
1035   document.set_schema("SingleLevelType");
1036   PropertyProto* prop = document.add_properties();
1037   prop->set_name("X");
1038   for (const std::string& s : string_values) {
1039     prop->add_string_values(s);
1040   }
1041   prop = document.add_properties();
1042   prop->set_name("Y");
1043   for (const std::string& s : string_values) {
1044     prop->add_string_values(s);
1045   }
1046   prop = document.add_properties();
1047   prop->set_name("Z");
1048   for (const std::string& s : string_values) {
1049     prop->add_string_values(s);
1050   }
1051 
1052   SectionIdMask section_mask = 0b00000111;
1053   SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
1054   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1055       query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
1056 
1057   EXPECT_THAT(snippet.entries(), SizeIs(6));
1058   EXPECT_THAT(snippet.entries(0).property_name(), Eq("X[1]"));
1059   std::string_view content =
1060       GetString(&document, snippet.entries(0).property_name());
1061   EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
1062   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
1063   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
1064 
1065   EXPECT_THAT(snippet.entries(1).property_name(), Eq("X[3]"));
1066   content = GetString(&document, snippet.entries(1).property_name());
1067   EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
1068   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
1069   EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
1070 
1071   EXPECT_THAT(GetPropertyPaths(snippet),
1072               ElementsAre("X[1]", "X[3]", "Y[1]", "Y[3]", "Z[1]", "Z[3]"));
1073 }
1074 
TEST_F(SnippetRetrieverTest,SnippetingTestMultiLevel)1075 TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevel) {
1076   SchemaProto schema =
1077       SchemaBuilder()
1078           .AddType(SchemaTypeConfigBuilder()
1079                        .SetType("SingleLevelType")
1080                        .AddProperty(PropertyConfigBuilder()
1081                                         .SetName("X")
1082                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1083                                                            TOKENIZER_PLAIN)
1084                                         .SetCardinality(CARDINALITY_REPEATED))
1085                        .AddProperty(PropertyConfigBuilder()
1086                                         .SetName("Y")
1087                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1088                                                            TOKENIZER_PLAIN)
1089                                         .SetCardinality(CARDINALITY_REPEATED))
1090                        .AddProperty(PropertyConfigBuilder()
1091                                         .SetName("Z")
1092                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1093                                                            TOKENIZER_PLAIN)
1094                                         .SetCardinality(CARDINALITY_REPEATED)))
1095           .AddType(SchemaTypeConfigBuilder()
1096                        .SetType("MultiLevelType")
1097                        .AddProperty(PropertyConfigBuilder()
1098                                         .SetName("A")
1099                                         .SetDataTypeDocument(
1100                                             "SingleLevelType",
1101                                             /*index_nested_properties=*/true)
1102                                         .SetCardinality(CARDINALITY_OPTIONAL))
1103                        .AddProperty(PropertyConfigBuilder()
1104                                         .SetName("B")
1105                                         .SetDataTypeDocument(
1106                                             "SingleLevelType",
1107                                             /*index_nested_properties=*/true)
1108                                         .SetCardinality(CARDINALITY_OPTIONAL))
1109                        .AddProperty(PropertyConfigBuilder()
1110                                         .SetName("C")
1111                                         .SetDataTypeDocument(
1112                                             "SingleLevelType",
1113                                             /*index_nested_properties=*/true)
1114                                         .SetCardinality(CARDINALITY_OPTIONAL)))
1115           .Build();
1116   ICING_ASSERT_OK(schema_store_->SetSchema(
1117       schema, /*ignore_errors_and_delete_documents=*/true,
1118       /*allow_circular_schema_definitions=*/false));
1119   ICING_ASSERT_OK_AND_ASSIGN(
1120       snippet_retriever_,
1121       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1122                                normalizer_.get()));
1123 
1124   std::vector<std::string> string_values = {"marco", "polo", "marco", "polo"};
1125   DocumentProto subdocument;
1126   PropertyProto* prop = subdocument.add_properties();
1127   prop->set_name("X");
1128   for (const std::string& s : string_values) {
1129     prop->add_string_values(s);
1130   }
1131   prop = subdocument.add_properties();
1132   prop->set_name("Y");
1133   for (const std::string& s : string_values) {
1134     prop->add_string_values(s);
1135   }
1136   prop = subdocument.add_properties();
1137   prop->set_name("Z");
1138   for (const std::string& s : string_values) {
1139     prop->add_string_values(s);
1140   }
1141 
1142   DocumentProto document;
1143   document.set_schema("MultiLevelType");
1144   prop = document.add_properties();
1145   prop->set_name("A");
1146   *prop->add_document_values() = subdocument;
1147 
1148   prop = document.add_properties();
1149   prop->set_name("B");
1150   *prop->add_document_values() = subdocument;
1151 
1152   prop = document.add_properties();
1153   prop->set_name("C");
1154   *prop->add_document_values() = subdocument;
1155 
1156   SectionIdMask section_mask = 0b111111111;
1157   SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
1158   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1159       query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
1160 
1161   EXPECT_THAT(snippet.entries(), SizeIs(18));
1162   EXPECT_THAT(snippet.entries(0).property_name(), Eq("A.X[1]"));
1163   std::string_view content =
1164       GetString(&document, snippet.entries(0).property_name());
1165   EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
1166   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
1167   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
1168 
1169   EXPECT_THAT(snippet.entries(1).property_name(), Eq("A.X[3]"));
1170   content = GetString(&document, snippet.entries(1).property_name());
1171   EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
1172   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
1173   EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
1174 
1175   EXPECT_THAT(
1176       GetPropertyPaths(snippet),
1177       ElementsAre("A.X[1]", "A.X[3]", "A.Y[1]", "A.Y[3]", "A.Z[1]", "A.Z[3]",
1178                   "B.X[1]", "B.X[3]", "B.Y[1]", "B.Y[3]", "B.Z[1]", "B.Z[3]",
1179                   "C.X[1]", "C.X[3]", "C.Y[1]", "C.Y[3]", "C.Z[1]", "C.Z[3]"));
1180 }
1181 
TEST_F(SnippetRetrieverTest,SnippetingTestMultiLevelRepeated)1182 TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelRepeated) {
1183   SchemaProto schema =
1184       SchemaBuilder()
1185           .AddType(SchemaTypeConfigBuilder()
1186                        .SetType("SingleLevelType")
1187                        .AddProperty(PropertyConfigBuilder()
1188                                         .SetName("X")
1189                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1190                                                            TOKENIZER_PLAIN)
1191                                         .SetCardinality(CARDINALITY_REPEATED))
1192                        .AddProperty(PropertyConfigBuilder()
1193                                         .SetName("Y")
1194                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1195                                                            TOKENIZER_PLAIN)
1196                                         .SetCardinality(CARDINALITY_REPEATED))
1197                        .AddProperty(PropertyConfigBuilder()
1198                                         .SetName("Z")
1199                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1200                                                            TOKENIZER_PLAIN)
1201                                         .SetCardinality(CARDINALITY_REPEATED)))
1202           .AddType(SchemaTypeConfigBuilder()
1203                        .SetType("MultiLevelType")
1204                        .AddProperty(PropertyConfigBuilder()
1205                                         .SetName("A")
1206                                         .SetDataTypeDocument(
1207                                             "SingleLevelType",
1208                                             /*index_nested_properties=*/true)
1209                                         .SetCardinality(CARDINALITY_REPEATED))
1210                        .AddProperty(PropertyConfigBuilder()
1211                                         .SetName("B")
1212                                         .SetDataTypeDocument(
1213                                             "SingleLevelType",
1214                                             /*index_nested_properties=*/true)
1215                                         .SetCardinality(CARDINALITY_REPEATED))
1216                        .AddProperty(PropertyConfigBuilder()
1217                                         .SetName("C")
1218                                         .SetDataTypeDocument(
1219                                             "SingleLevelType",
1220                                             /*index_nested_properties=*/true)
1221                                         .SetCardinality(CARDINALITY_REPEATED)))
1222           .Build();
1223   ICING_ASSERT_OK(schema_store_->SetSchema(
1224       schema, /*ignore_errors_and_delete_documents=*/true,
1225       /*allow_circular_schema_definitions=*/false));
1226   ICING_ASSERT_OK_AND_ASSIGN(
1227       snippet_retriever_,
1228       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1229                                normalizer_.get()));
1230 
1231   std::vector<std::string> string_values = {"marco", "polo", "marco", "polo"};
1232   DocumentProto subdocument;
1233   PropertyProto* prop = subdocument.add_properties();
1234   prop->set_name("X");
1235   for (const std::string& s : string_values) {
1236     prop->add_string_values(s);
1237   }
1238   prop = subdocument.add_properties();
1239   prop->set_name("Y");
1240   for (const std::string& s : string_values) {
1241     prop->add_string_values(s);
1242   }
1243   prop = subdocument.add_properties();
1244   prop->set_name("Z");
1245   for (const std::string& s : string_values) {
1246     prop->add_string_values(s);
1247   }
1248 
1249   DocumentProto document;
1250   document.set_schema("MultiLevelType");
1251   prop = document.add_properties();
1252   prop->set_name("A");
1253   *prop->add_document_values() = subdocument;
1254   *prop->add_document_values() = subdocument;
1255 
1256   prop = document.add_properties();
1257   prop->set_name("B");
1258   *prop->add_document_values() = subdocument;
1259   *prop->add_document_values() = subdocument;
1260 
1261   prop = document.add_properties();
1262   prop->set_name("C");
1263   *prop->add_document_values() = subdocument;
1264   *prop->add_document_values() = subdocument;
1265 
1266   SectionIdMask section_mask = 0b111111111;
1267   SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
1268   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1269       query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
1270 
1271   EXPECT_THAT(snippet.entries(), SizeIs(36));
1272   EXPECT_THAT(snippet.entries(0).property_name(), Eq("A[0].X[1]"));
1273   std::string_view content =
1274       GetString(&document, snippet.entries(0).property_name());
1275   EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
1276   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
1277   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
1278 
1279   EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[0].X[3]"));
1280   content = GetString(&document, snippet.entries(1).property_name());
1281   EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
1282   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
1283   EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
1284 
1285   EXPECT_THAT(GetPropertyPaths(snippet),
1286               ElementsAre("A[0].X[1]", "A[0].X[3]", "A[1].X[1]", "A[1].X[3]",
1287                           "A[0].Y[1]", "A[0].Y[3]", "A[1].Y[1]", "A[1].Y[3]",
1288                           "A[0].Z[1]", "A[0].Z[3]", "A[1].Z[1]", "A[1].Z[3]",
1289                           "B[0].X[1]", "B[0].X[3]", "B[1].X[1]", "B[1].X[3]",
1290                           "B[0].Y[1]", "B[0].Y[3]", "B[1].Y[1]", "B[1].Y[3]",
1291                           "B[0].Z[1]", "B[0].Z[3]", "B[1].Z[1]", "B[1].Z[3]",
1292                           "C[0].X[1]", "C[0].X[3]", "C[1].X[1]", "C[1].X[3]",
1293                           "C[0].Y[1]", "C[0].Y[3]", "C[1].Y[1]", "C[1].Y[3]",
1294                           "C[0].Z[1]", "C[0].Z[3]", "C[1].Z[1]", "C[1].Z[3]"));
1295 }
1296 
TEST_F(SnippetRetrieverTest,SnippetingTestMultiLevelSingleValue)1297 TEST_F(SnippetRetrieverTest, SnippetingTestMultiLevelSingleValue) {
1298   SchemaProto schema =
1299       SchemaBuilder()
1300           .AddType(SchemaTypeConfigBuilder()
1301                        .SetType("SingleLevelType")
1302                        .AddProperty(PropertyConfigBuilder()
1303                                         .SetName("X")
1304                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1305                                                            TOKENIZER_PLAIN)
1306                                         .SetCardinality(CARDINALITY_OPTIONAL))
1307                        .AddProperty(PropertyConfigBuilder()
1308                                         .SetName("Y")
1309                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1310                                                            TOKENIZER_PLAIN)
1311                                         .SetCardinality(CARDINALITY_OPTIONAL))
1312                        .AddProperty(PropertyConfigBuilder()
1313                                         .SetName("Z")
1314                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1315                                                            TOKENIZER_PLAIN)
1316                                         .SetCardinality(CARDINALITY_OPTIONAL)))
1317           .AddType(SchemaTypeConfigBuilder()
1318                        .SetType("MultiLevelType")
1319                        .AddProperty(PropertyConfigBuilder()
1320                                         .SetName("A")
1321                                         .SetDataTypeDocument(
1322                                             "SingleLevelType",
1323                                             /*index_nested_properties=*/true)
1324                                         .SetCardinality(CARDINALITY_REPEATED))
1325                        .AddProperty(PropertyConfigBuilder()
1326                                         .SetName("B")
1327                                         .SetDataTypeDocument(
1328                                             "SingleLevelType",
1329                                             /*index_nested_properties=*/true)
1330                                         .SetCardinality(CARDINALITY_REPEATED))
1331                        .AddProperty(PropertyConfigBuilder()
1332                                         .SetName("C")
1333                                         .SetDataTypeDocument(
1334                                             "SingleLevelType",
1335                                             /*index_nested_properties=*/true)
1336                                         .SetCardinality(CARDINALITY_REPEATED)))
1337           .Build();
1338   ICING_ASSERT_OK(schema_store_->SetSchema(
1339       schema, /*ignore_errors_and_delete_documents=*/true,
1340       /*allow_circular_schema_definitions=*/false));
1341   ICING_ASSERT_OK_AND_ASSIGN(
1342       snippet_retriever_,
1343       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1344                                normalizer_.get()));
1345 
1346   DocumentProto subdocument;
1347   PropertyProto* prop = subdocument.add_properties();
1348   prop->set_name("X");
1349   prop->add_string_values("polo");
1350   prop = subdocument.add_properties();
1351   prop->set_name("Y");
1352   prop->add_string_values("marco");
1353   prop = subdocument.add_properties();
1354   prop->set_name("Z");
1355   prop->add_string_values("polo");
1356 
1357   DocumentProto document;
1358   document.set_schema("MultiLevelType");
1359   prop = document.add_properties();
1360   prop->set_name("A");
1361   *prop->add_document_values() = subdocument;
1362   *prop->add_document_values() = subdocument;
1363 
1364   prop = document.add_properties();
1365   prop->set_name("B");
1366   *prop->add_document_values() = subdocument;
1367   *prop->add_document_values() = subdocument;
1368 
1369   prop = document.add_properties();
1370   prop->set_name("C");
1371   *prop->add_document_values() = subdocument;
1372   *prop->add_document_values() = subdocument;
1373 
1374   SectionIdMask section_mask = 0b111111111;
1375   SectionRestrictQueryTermsMap query_terms{{"", {"polo"}}};
1376   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1377       query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);
1378 
1379   EXPECT_THAT(snippet.entries(), SizeIs(12));
1380   EXPECT_THAT(snippet.entries(0).property_name(), Eq("A[0].X"));
1381   std::string_view content =
1382       GetString(&document, snippet.entries(0).property_name());
1383   EXPECT_THAT(GetWindows(content, snippet.entries(0)), ElementsAre("polo"));
1384   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("polo"));
1385   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("polo"));
1386 
1387   EXPECT_THAT(snippet.entries(1).property_name(), Eq("A[1].X"));
1388   content = GetString(&document, snippet.entries(1).property_name());
1389   EXPECT_THAT(GetWindows(content, snippet.entries(1)), ElementsAre("polo"));
1390   EXPECT_THAT(GetMatches(content, snippet.entries(1)), ElementsAre("polo"));
1391   EXPECT_THAT(GetSubMatches(content, snippet.entries(1)), ElementsAre("polo"));
1392 
1393   EXPECT_THAT(
1394       GetPropertyPaths(snippet),
1395       ElementsAre("A[0].X", "A[1].X", "A[0].Z", "A[1].Z", "B[0].X", "B[1].X",
1396                   "B[0].Z", "B[1].Z", "C[0].X", "C[1].X", "C[0].Z", "C[1].Z"));
1397 }
1398 
TEST_F(SnippetRetrieverTest, CJKSnippetMatchTest) {
  // Verifies match and submatch offsets for a prefix query against CJK text.
  //
  // String:     "我每天走路去上班。"
  //              ^ ^  ^   ^^
  // UTF8 idx:    0 3  9  15 18
  // UTF16 idx:   0 1  3   5 6
  // Breaks into segments: "我", "每天", "走路", "去", "上班"
  constexpr std::string_view kSubjectText = "我每天走路去上班。";
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", kSubjectText)
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  // Mask with the two lowest section bits set; the query term is a prefix of
  // the segment "走路".
  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Exactly one property may match, and it has to be "subject".
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto& subject_entry = snippet.entries(0);
  EXPECT_THAT(subject_entry.property_name(), Eq("subject"));
  std::string_view subject_content =
      GetString(&document, subject_entry.property_name());

  // "subject" has to contain exactly one match.
  ASSERT_THAT(subject_entry.snippet_matches(), SizeIs(1));
  const SnippetMatchProto& match = subject_entry.snippet_matches(0);

  // The whole token is the match; the queried prefix is the submatch.
  EXPECT_THAT(GetMatches(subject_content, subject_entry),
              ElementsAre("走路"));
  EXPECT_THAT(GetSubMatches(subject_content, subject_entry),
              ElementsAre("走"));

  // UTF-16 offsets: "走路" starts at code unit 3 and is 2 units long; the
  // submatch "走" is 1 unit long.
  EXPECT_THAT(match.exact_match_utf16_position(), Eq(3));
  EXPECT_THAT(match.exact_match_utf16_length(), Eq(2));
  EXPECT_THAT(match.submatch_utf16_length(), Eq(1));
}
1442 
TEST_F(SnippetRetrieverTest,CJKSnippetWindowTest)1443 TEST_F(SnippetRetrieverTest, CJKSnippetWindowTest) {
1444   language_segmenter_factory::SegmenterOptions options(ULOC_SIMPLIFIED_CHINESE,
1445                                                        jni_cache_.get());
1446   ICING_ASSERT_OK_AND_ASSIGN(
1447       language_segmenter_,
1448       language_segmenter_factory::Create(std::move(options)));
1449   ICING_ASSERT_OK_AND_ASSIGN(
1450       snippet_retriever_,
1451       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1452                                normalizer_.get()));
1453 
1454   // String:     "我每天走路去上班。"
1455   //              ^ ^  ^   ^^
1456   // UTF8 idx:    0 3  9  15 18
1457   // UTF16 idx:   0 1  3   5 6
1458   // UTF32 idx:   0 1  3   5 6
1459   // Breaks into segments: "我", "每天", "走路", "去", "上班"
1460   constexpr std::string_view kChinese = "我每天走路去上班。";
1461   DocumentProto document =
1462       DocumentBuilder()
1463           .SetKey("icing", "email/1")
1464           .SetSchema("email")
1465           .AddStringProperty("subject", kChinese)
1466           .AddStringProperty("body",
1467                              "Concerning the subject of foo, we need to begin "
1468                              "considering our options regarding body bar.")
1469           .Build();
1470 
1471   SectionIdMask section_mask = 0b00000011;
1472   SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
1473 
1474   // The window will be:
1475   //   1. untrimmed, no-shifting window will be (0,7).
1476   //   2. trimmed, no-shifting window [1, 6) "每天走路去".
1477   //   3. trimmed, shifted window [0, 6) "我每天走路去"
1478   snippet_spec_.set_max_window_utf32_length(6);
1479 
1480   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1481       query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
1482 
1483   // Ensure that one and only one property was matched and it was "body"
1484   ASSERT_THAT(snippet.entries(), SizeIs(1));
1485   const SnippetProto::EntryProto* entry = &snippet.entries(0);
1486   EXPECT_THAT(entry->property_name(), Eq("subject"));
1487   std::string_view content =
1488       GetString(&document, snippet.entries(0).property_name());
1489 
1490   // Ensure that there is one and only one match within "subject"
1491   ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
1492   const SnippetMatchProto& match_proto = entry->snippet_matches(0);
1493 
1494   // Ensure that the match is correct.
1495   EXPECT_THAT(GetWindows(content, *entry), ElementsAre("我每天走路去"));
1496 
1497   // Ensure that the utf-16 values are also as expected
1498   EXPECT_THAT(match_proto.window_utf16_position(), Eq(0));
1499   EXPECT_THAT(match_proto.window_utf16_length(), Eq(6));
1500 }
1501 
TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitSnippetMatchTest) {
  // Verifies match and submatch offsets when the text consists of characters
  // that need four bytes in UTF-8 and, most importantly, two code units (a
  // surrogate pair) in UTF-16.
  //
  // The characters are written as \U escapes so that the literal survives
  // tools that cannot represent supplementary-plane characters.
  //
  // Code points:  U+10000 U+10001 ' ' U+10002 U+10003 ' ' U+10004
  // Token start:  ^ (1st)             ^ (2nd)             ^ (3rd)
  // UTF8 idx:     0                   9                   18
  // UTF16 idx:    0                   5                   10
  // Breaks into segments: "U+10000 U+10001", "U+10002 U+10003", "U+10004"
  constexpr std::string_view kText =
      "\U00010000\U00010001 \U00010002\U00010003 \U00010004";
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", kText)
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  // The query term is the first character of the second token only, so a
  // prefix match can hit exactly one token.
  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"\U00010002"}}};

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Ensure that one and only one property was matched and it was "subject"
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto* entry = &snippet.entries(0);
  EXPECT_THAT(entry->property_name(), Eq("subject"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());

  // Ensure that there is one and only one match within "subject"
  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
  const SnippetMatchProto& match_proto = entry->snippet_matches(0);

  // The match is the whole second token; the submatch is the queried prefix.
  EXPECT_THAT(GetMatches(content, *entry),
              ElementsAre("\U00010002\U00010003"));
  EXPECT_THAT(GetSubMatches(content, *entry), ElementsAre("\U00010002"));

  // UTF-16 values: the second token starts at code unit 5 and is two
  // surrogate pairs (4 units) long; the submatch is one pair (2 units).
  EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(5));
  EXPECT_THAT(match_proto.exact_match_utf16_length(), Eq(4));
  EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
}
1547 
TEST_F(SnippetRetrieverTest, Utf16MultiCodeUnitWindowTest) {
  // Verifies window offsets when the text consists of characters that need
  // four bytes in UTF-8 and, most importantly, two code units (a surrogate
  // pair) in UTF-16.
  //
  // The characters are written as \U escapes so that the literal survives
  // tools that cannot represent supplementary-plane characters.
  //
  // Code points:  U+10000 U+10001 ' ' U+10002 U+10003 ' ' U+10004
  // Token start:  ^ (1st)             ^ (2nd)             ^ (3rd)
  // UTF8 idx:     0                   9                   18
  // UTF16 idx:    0                   5                   10
  // UTF32 idx:    0                   3                   6
  // Breaks into segments: "U+10000 U+10001", "U+10002 U+10003", "U+10004"
  constexpr std::string_view kText =
      "\U00010000\U00010001 \U00010002\U00010003 \U00010004";
  DocumentProto document =
      DocumentBuilder()
          .SetKey("icing", "email/1")
          .SetSchema("email")
          .AddStringProperty("subject", kText)
          .AddStringProperty("body",
                             "Concerning the subject of foo, we need to begin "
                             "considering our options regarding body bar.")
          .Build();

  SectionIdMask section_mask = 0b00000011;
  SectionRestrictQueryTermsMap query_terms{{"", {"\U00010002"}}};

  // Set a six character window. This will produce a window covering the
  // second token through the end of the text:
  //   "U+10002 U+10003 ' ' U+10004"
  // UTF8 idx:   [9, 22)
  // UTF16 idx:  [5, 12)
  // UTF32 idx:  [3, 7)
  snippet_spec_.set_max_window_utf32_length(6);

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Ensure that one and only one property was matched and it was "subject"
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  const SnippetProto::EntryProto* entry = &snippet.entries(0);
  EXPECT_THAT(entry->property_name(), Eq("subject"));
  std::string_view content =
      GetString(&document, snippet.entries(0).property_name());

  // Ensure that there is one and only one match within "subject"
  ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
  const SnippetMatchProto& match_proto = entry->snippet_matches(0);

  // Ensure that the window is correct.
  EXPECT_THAT(GetWindows(content, *entry),
              ElementsAre("\U00010002\U00010003 \U00010004"));

  // UTF-16 values: the window starts at code unit 5 and spans three
  // surrogate pairs plus one space (7 units).
  EXPECT_THAT(match_proto.window_utf16_position(), Eq(5));
  EXPECT_THAT(match_proto.window_utf16_length(), Eq(7));
}
1600 
TEST_F(SnippetRetrieverTest, SnippettingVerbatimAscii) {
  // Schema with a single repeated string property "verbatim" that uses
  // TERM_MATCH_EXACT together with TOKENIZER_VERBATIM.
  SchemaProto schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("verbatimType")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("verbatim")
                                        .SetDataTypeString(TERM_MATCH_EXACT,
                                                           TOKENIZER_VERBATIM)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));
  // Re-create the snippet retriever after the schema has been set.
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  DocumentProto document = DocumentBuilder()
                               .SetKey("icing", "verbatim/1")
                               .SetSchema("verbatimType")
                               .AddStringProperty("verbatim", "Hello, world!")
                               .Build();

  // The query term equals the whole property value, and the window is sized
  // to the 13 characters of that value.
  SectionIdMask section_mask = 0b00000001;
  SectionRestrictQueryTermsMap query_terms{{"", {"Hello, world!"}}};
  snippet_spec_.set_max_window_utf32_length(13);

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_EXACT, snippet_spec_, document, section_mask);

  // Exactly one entry with exactly one match is expected: the verbatim token
  // in its entirety.
  ASSERT_THAT(snippet.entries(), SizeIs(1));

  const SnippetProto::EntryProto& verbatim_entry = snippet.entries(0);
  ASSERT_THAT(verbatim_entry.snippet_matches(), SizeIs(1));
  ASSERT_THAT(verbatim_entry.property_name(), "verbatim");

  const SnippetMatchProto& match = verbatim_entry.snippet_matches(0);
  // The window starts at byte 0 and covers the entire 13-character token.
  EXPECT_THAT(match.window_byte_position(), Eq(0));
  EXPECT_THAT(match.window_utf16_length(), Eq(13));

  // The exact match starts at UTF-16 position 0. Because the query term
  // "Hello, world!" is the full token and the retrieval uses exact term
  // matching, the submatch has the same UTF-16 length (13) as the window.
  EXPECT_THAT(match.exact_match_utf16_position(), Eq(0));
  EXPECT_THAT(match.submatch_utf16_length(), Eq(13));
}
1654 
TEST_F(SnippetRetrieverTest,SnippettingVerbatimCJK)1655 TEST_F(SnippetRetrieverTest, SnippettingVerbatimCJK) {
1656   SchemaProto schema =
1657       SchemaBuilder()
1658           .AddType(SchemaTypeConfigBuilder()
1659                        .SetType("verbatimType")
1660                        .AddProperty(PropertyConfigBuilder()
1661                                         .SetName("verbatim")
1662                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1663                                                            TOKENIZER_VERBATIM)
1664                                         .SetCardinality(CARDINALITY_REPEATED)))
1665           .Build();
1666   ICING_ASSERT_OK(schema_store_->SetSchema(
1667       schema, /*ignore_errors_and_delete_documents=*/true,
1668       /*allow_circular_schema_definitions=*/false));
1669   ICING_ASSERT_OK_AND_ASSIGN(
1670       snippet_retriever_,
1671       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1672                                normalizer_.get()));
1673 
1674   // String:     "我每天走路去上班。"
1675   //              ^ ^  ^   ^^
1676   // UTF8 idx:    0 3  9  15 18
1677   // UTF16 idx:   0 1  3   5 6
1678   // UTF32 idx:   0 1  3   5 6
1679   // Breaks into segments: "我", "每天", "走路", "去", "上班"
1680   std::string chinese_string = "我每天走路去上班。";
1681   DocumentProto document = DocumentBuilder()
1682                                .SetKey("icing", "verbatim/1")
1683                                .SetSchema("verbatimType")
1684                                .AddStringProperty("verbatim", chinese_string)
1685                                .Build();
1686 
1687   SectionIdMask section_mask = 0b00000001;
1688   SectionRestrictQueryTermsMap query_terms{{"", {"我每"}}};
1689 
1690   snippet_spec_.set_max_window_utf32_length(9);
1691   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1692       query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
1693 
1694   // There should only be one snippet entry and match, the verbatim token in its
1695   // entirety.
1696   ASSERT_THAT(snippet.entries(), SizeIs(1));
1697 
1698   const SnippetProto::EntryProto* entry = &snippet.entries(0);
1699   ASSERT_THAT(entry->snippet_matches(), SizeIs(1));
1700   ASSERT_THAT(entry->property_name(), "verbatim");
1701 
1702   const SnippetMatchProto& match_proto = entry->snippet_matches(0);
1703   // We expect the match to begin at position 0, and to span the entire token
1704   // which has utf-16 length of 9.
1705   EXPECT_THAT(match_proto.window_byte_position(), Eq(0));
1706   EXPECT_THAT(match_proto.window_utf16_length(), Eq(9));
1707 
1708   // We expect the submatch to begin at position 0 of the verbatim token and
1709   // span the length of our query term "我每", which has utf-16 length of 2.
1710   EXPECT_THAT(match_proto.exact_match_utf16_position(), Eq(0));
1711   EXPECT_THAT(match_proto.submatch_utf16_length(), Eq(2));
1712 }
1713 
TEST_F(SnippetRetrieverTest,SnippettingRfc822Ascii)1714 TEST_F(SnippetRetrieverTest, SnippettingRfc822Ascii) {
1715   SchemaProto schema =
1716       SchemaBuilder()
1717           .AddType(SchemaTypeConfigBuilder()
1718                        .SetType("rfc822Type")
1719                        .AddProperty(PropertyConfigBuilder()
1720                                         .SetName("rfc822")
1721                                         .SetDataTypeString(TERM_MATCH_PREFIX,
1722                                                            TOKENIZER_RFC822)
1723                                         .SetCardinality(CARDINALITY_REPEATED)))
1724           .Build();
1725   ICING_ASSERT_OK(schema_store_->SetSchema(
1726       schema, /*ignore_errors_and_delete_documents=*/true,
1727       /*allow_circular_schema_definitions=*/false));
1728 
1729   ICING_ASSERT_OK_AND_ASSIGN(
1730       snippet_retriever_,
1731       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1732                                normalizer_.get()));
1733 
1734   DocumentProto document =
1735       DocumentBuilder()
1736           .SetKey("icing", "rfc822/1")
1737           .SetSchema("rfc822Type")
1738           .AddStringProperty("rfc822",
1739                              "Alexander Sav <tom.bar@google.com>, Very Long "
1740                              "Name Example <tjbarron@google.com>")
1741           .Build();
1742 
1743   SectionIdMask section_mask = 0b00000001;
1744 
1745   // This should match both the first name token as well as the entire RFC822.
1746   SectionRestrictQueryTermsMap query_terms{{"", {"alexand"}}};
1747 
1748   snippet_spec_.set_max_window_utf32_length(35);
1749 
1750   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1751       query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
1752 
1753   ASSERT_THAT(snippet.entries(), SizeIs(1));
1754   EXPECT_THAT(snippet.entries(0).property_name(), "rfc822");
1755 
1756   std::string_view content =
1757       GetString(&document, snippet.entries(0).property_name());
1758 
1759   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
1760               ElementsAre("Alexander Sav <tom.bar@google.com>,",
1761                           "Alexander Sav <tom.bar@google.com>,"));
1762   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
1763               ElementsAre("Alexander Sav <tom.bar@google.com>", "Alexander"));
1764   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
1765               ElementsAre("Alexand", "Alexand"));
1766 
1767   // "tom" should match the local component, local address, and address tokens.
1768   query_terms = SectionRestrictQueryTermsMap{{"", {"tom"}}};
1769   snippet_spec_.set_max_window_utf32_length(36);
1770 
1771   snippet = snippet_retriever_->RetrieveSnippet(
1772       query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);
1773 
1774   ASSERT_THAT(snippet.entries(), SizeIs(1));
1775   EXPECT_THAT(snippet.entries(0).property_name(), "rfc822");
1776 
1777   content = GetString(&document, snippet.entries(0).property_name());
1778 
1779   // TODO(b/248362902) Stop returning duplicate matches.
1780   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
1781               ElementsAre("Alexander Sav <tom.bar@google.com>,",
1782                           "Alexander Sav <tom.bar@google.com>,",
1783                           "Alexander Sav <tom.bar@google.com>,"));
1784   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
1785               ElementsAre("tom.bar", "tom.bar@google.com", "tom"));
1786   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
1787               ElementsAre("tom", "tom", "tom"));
1788 }
1789 
TEST_F(SnippetRetrieverTest, SnippettingRfc822CJK) {
  // Schema with a single repeated string property "rfc822" that uses
  // TERM_MATCH_PREFIX together with TOKENIZER_RFC822.
  SchemaProto schema =
      SchemaBuilder()
          .AddType(SchemaTypeConfigBuilder()
                       .SetType("rfc822Type")
                       .AddProperty(PropertyConfigBuilder()
                                        .SetName("rfc822")
                                        .SetDataTypeString(TERM_MATCH_PREFIX,
                                                           TOKENIZER_RFC822)
                                        .SetCardinality(CARDINALITY_REPEATED)))
          .Build();
  ICING_ASSERT_OK(schema_store_->SetSchema(
      schema, /*ignore_errors_and_delete_documents=*/true,
      /*allow_circular_schema_definitions=*/false));

  // Re-create the snippet retriever after the schema has been set.
  ICING_ASSERT_OK_AND_ASSIGN(
      snippet_retriever_,
      SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
                               normalizer_.get()));

  std::string rfc822_text = "我, 每天@走路, 去@上班";
  DocumentProto document = DocumentBuilder()
                               .SetKey("icing", "rfc822/1")
                               .SetSchema("rfc822Type")
                               .AddStringProperty("rfc822", rfc822_text)
                               .Build();

  // Prefix query for "走" with an 8-character window.
  SectionIdMask section_mask = 0b00000001;
  SectionRestrictQueryTermsMap query_terms{{"", {"走"}}};
  snippet_spec_.set_max_window_utf32_length(8);

  SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
      query_terms, TERM_MATCH_PREFIX, snippet_spec_, document, section_mask);

  // Exactly one snippet entry is expected, for the "rfc822" property.
  ASSERT_THAT(snippet.entries(), SizeIs(1));
  EXPECT_THAT(snippet.entries(0).property_name(), "rfc822");

  const SnippetProto::EntryProto& rfc822_entry = snippet.entries(0);
  std::string_view rfc822_content =
      GetString(&document, rfc822_entry.property_name());

  // Two matches are expected, both on "走路" with the same window.
  // NOTE(review): presumably these come from different RFC822 token kinds
  // covering the same text - confirm against the tokenizer.
  // TODO(b/248362902) Stop returning duplicate matches.
  EXPECT_THAT(GetWindows(rfc822_content, rfc822_entry),
              ElementsAre("每天@走路,", "每天@走路,"));
  EXPECT_THAT(GetMatches(rfc822_content, rfc822_entry),
              ElementsAre("走路", "走路"));
  EXPECT_THAT(GetSubMatches(rfc822_content, rfc822_entry),
              ElementsAre("走", "走"));
}
1843 
1844 #ifdef ENABLE_URL_TOKENIZER
TEST_F(SnippetRetrieverTest,SnippettingUrlAscii)1845 TEST_F(SnippetRetrieverTest, SnippettingUrlAscii) {
1846   SchemaProto schema =
1847       SchemaBuilder()
1848           .AddType(SchemaTypeConfigBuilder().SetType("urlType").AddProperty(
1849               PropertyConfigBuilder()
1850                   .SetName("url")
1851                   .SetDataTypeString(MATCH_PREFIX, TOKENIZER_URL)
1852                   .SetCardinality(CARDINALITY_REPEATED)))
1853           .Build();
1854   ICING_ASSERT_OK(schema_store_->SetSchema(
1855       schema, /*ignore_errors_and_delete_documents=*/true));
1856 
1857   ICING_ASSERT_OK_AND_ASSIGN(
1858       snippet_retriever_,
1859       SnippetRetriever::Create(schema_store_.get(), language_segmenter_.get(),
1860                                normalizer_.get()));
1861 
1862   DocumentProto document =
1863       DocumentBuilder()
1864           .SetKey("icing", "url/1")
1865           .SetSchema("urlType")
1866           .AddStringProperty("url", "https://mail.google.com/calendar/google/")
1867           .Build();
1868 
1869   SectionIdMask section_mask = 0b00000001;
1870 
1871   // Query with single url split-token match
1872   SectionRestrictQueryTermsMap query_terms{{"", {"com"}}};
1873   // 40 is the length of the url.
1874   // Window that is the size of the url should return entire url.
1875   snippet_spec_.set_max_window_utf32_length(40);
1876 
1877   SnippetProto snippet = snippet_retriever_->RetrieveSnippet(
1878       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
1879 
1880   ASSERT_THAT(snippet.entries(), SizeIs(1));
1881   EXPECT_THAT(snippet.entries(0).property_name(), "url");
1882 
1883   std::string_view content =
1884       GetString(&document, snippet.entries(0).property_name());
1885 
1886   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
1887               ElementsAre("https://mail.google.com/calendar/google/"));
1888   EXPECT_THAT(GetMatches(content, snippet.entries(0)), ElementsAre("com"));
1889   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)), ElementsAre("com"));
1890 
1891   // Query with single url suffix-token match
1892   query_terms = SectionRestrictQueryTermsMap{{"", {"mail.goo"}}};
1893   snippet_spec_.set_max_window_utf32_length(40);
1894 
1895   snippet = snippet_retriever_->RetrieveSnippet(
1896       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
1897 
1898   ASSERT_THAT(snippet.entries(), SizeIs(1));
1899   EXPECT_THAT(snippet.entries(0).property_name(), "url");
1900 
1901   content = GetString(&document, snippet.entries(0).property_name());
1902 
1903   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
1904               ElementsAre("https://mail.google.com/calendar/google/"));
1905   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
1906               ElementsAre("mail.google.com/calendar/google/"));
1907   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
1908               ElementsAre("mail.goo"));
1909 
1910   // Query with multiple url split-token matches
1911   query_terms = SectionRestrictQueryTermsMap{{"", {"goog"}}};
1912   snippet_spec_.set_max_window_utf32_length(40);
1913 
1914   snippet = snippet_retriever_->RetrieveSnippet(
1915       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
1916 
1917   ASSERT_THAT(snippet.entries(), SizeIs(1));
1918   EXPECT_THAT(snippet.entries(0).property_name(), "url");
1919 
1920   content = GetString(&document, snippet.entries(0).property_name());
1921 
1922   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
1923               ElementsAre("https://mail.google.com/calendar/google/",
1924                           "https://mail.google.com/calendar/google/"));
1925   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
1926               ElementsAre("google", "google"));
1927   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
1928               ElementsAre("goog", "goog"));
1929 
1930   // Query with both url split-token and suffix-token matches
1931   query_terms = SectionRestrictQueryTermsMap{{"", {"mail"}}};
1932   snippet_spec_.set_max_window_utf32_length(40);
1933 
1934   snippet = snippet_retriever_->RetrieveSnippet(
1935       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
1936 
1937   ASSERT_THAT(snippet.entries(), SizeIs(1));
1938   EXPECT_THAT(snippet.entries(0).property_name(), "url");
1939 
1940   content = GetString(&document, snippet.entries(0).property_name());
1941 
1942   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
1943               ElementsAre("https://mail.google.com/calendar/google/",
1944                           "https://mail.google.com/calendar/google/"));
1945   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
1946               ElementsAre("mail", "mail.google.com/calendar/google/"));
1947   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
1948               ElementsAre("mail", "mail"));
1949 
1950   // Prefix query with both url split-token and suffix-token matches
1951   query_terms = SectionRestrictQueryTermsMap{{"", {"http"}}};
1952   snippet_spec_.set_max_window_utf32_length(40);
1953 
1954   snippet = snippet_retriever_->RetrieveSnippet(
1955       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
1956 
1957   ASSERT_THAT(snippet.entries(), SizeIs(1));
1958   EXPECT_THAT(snippet.entries(0).property_name(), "url");
1959 
1960   content = GetString(&document, snippet.entries(0).property_name());
1961 
1962   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
1963               ElementsAre("https://mail.google.com/calendar/google/",
1964                           "https://mail.google.com/calendar/google/"));
1965   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
1966               ElementsAre("https", "https://mail.google.com/calendar/google/"));
1967   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
1968               ElementsAre("http", "http"));
1969 
1970   // Window that's smaller than the input size should not return any matches.
1971   query_terms = SectionRestrictQueryTermsMap{{"", {"google"}}};
1972   snippet_spec_.set_max_window_utf32_length(10);
1973 
1974   snippet = snippet_retriever_->RetrieveSnippet(
1975       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
1976 
1977   ASSERT_THAT(snippet.entries(), SizeIs(0));
1978 
1979   // Test case with more than two matches
1980   document =
1981       DocumentBuilder()
1982           .SetKey("icing", "url/1")
1983           .SetSchema("urlType")
1984           .AddStringProperty("url", "https://www.google.com/calendar/google/")
1985           .Build();
1986 
1987   // Query with two url split-token matches and one suffix-token match
1988   query_terms = SectionRestrictQueryTermsMap{{"", {"google"}}};
1989   snippet_spec_.set_max_window_utf32_length(39);
1990 
1991   snippet = snippet_retriever_->RetrieveSnippet(
1992       query_terms, MATCH_PREFIX, snippet_spec_, document, section_mask);
1993 
1994   ASSERT_THAT(snippet.entries(), SizeIs(1));
1995   EXPECT_THAT(snippet.entries(0).property_name(), "url");
1996 
1997   content = GetString(&document, snippet.entries(0).property_name());
1998 
1999   EXPECT_THAT(GetWindows(content, snippet.entries(0)),
2000               ElementsAre("https://www.google.com/calendar/google/",
2001                           "https://www.google.com/calendar/google/",
2002                           "https://www.google.com/calendar/google/"));
2003   EXPECT_THAT(GetMatches(content, snippet.entries(0)),
2004               ElementsAre("google", "google", "google.com/calendar/google/"));
2005   EXPECT_THAT(GetSubMatches(content, snippet.entries(0)),
2006               ElementsAre("google", "google", "google"));
2007 }
2008 #endif  // ENABLE_URL_TOKENIZER
2009 
2010 }  // namespace
2011 
2012 }  // namespace lib
2013 }  // namespace icing
2014