• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/index/term-indexing-handler.h"
16 
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "gmock/gmock.h"
29 #include "gtest/gtest.h"
30 #include "icing/absl_ports/str_cat.h"
31 #include "icing/document-builder.h"
32 #include "icing/file/filesystem.h"
33 #include "icing/file/portable-file-backed-proto-log.h"
34 #include "icing/index/hit/doc-hit-info.h"
35 #include "icing/index/hit/hit.h"
36 #include "icing/index/index.h"
37 #include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
38 #include "icing/index/iterator/doc-hit-info-iterator.h"
39 #include "icing/index/property-existence-indexing-handler.h"
40 #include "icing/legacy/index/icing-filesystem.h"
41 #include "icing/portable/platform.h"
42 #include "icing/proto/document.pb.h"
43 #include "icing/proto/document_wrapper.pb.h"
44 #include "icing/proto/schema.pb.h"
45 #include "icing/proto/term.pb.h"
46 #include "icing/schema-builder.h"
47 #include "icing/schema/schema-store.h"
48 #include "icing/schema/section.h"
49 #include "icing/store/document-id.h"
50 #include "icing/store/document-store.h"
51 #include "icing/testing/common-matchers.h"
52 #include "icing/testing/fake-clock.h"
53 #include "icing/testing/icu-data-file-helper.h"
54 #include "icing/testing/test-data.h"
55 #include "icing/testing/tmp-directory.h"
56 #include "icing/tokenization/language-segmenter-factory.h"
57 #include "icing/tokenization/language-segmenter.h"
58 #include "icing/transform/normalizer-factory.h"
59 #include "icing/transform/normalizer.h"
60 #include "icing/util/tokenized-document.h"
61 #include "unicode/uloc.h"
62 
63 namespace icing {
64 namespace lib {
65 
66 namespace {
67 
68 using ::testing::ElementsAre;
69 using ::testing::Eq;
70 using ::testing::IsEmpty;
71 using ::testing::IsFalse;
72 using ::testing::IsTrue;
73 using ::testing::Test;
74 
75 // Schema type with indexable properties and section Id.
76 // Section Id is determined by the lexicographical order of indexable property
77 // path.
78 // Section id = 0: body
79 // Section id = 1: title
// Name of the flat schema type and of its two indexable string properties.
constexpr std::string_view kFakeType{"FakeType"};
constexpr std::string_view kPropertyBody{"body"};
constexpr std::string_view kPropertyTitle{"title"};
83 
84 constexpr SectionId kSectionIdBody = 0;
85 constexpr SectionId kSectionIdTitle = 1;
86 
87 // Schema type with nested indexable properties and section Id.
88 // Section id = 0: "name"
89 // Section id = 1: "nested.body"
90 // Section id = 2: "nested.title"
91 // Section id = 3: "subject"
// Name of the schema type that embeds a FakeType document, and the names of
// its own properties.
constexpr std::string_view kNestedType{"NestedType"};
constexpr std::string_view kPropertyName{"name"};
constexpr std::string_view kPropertyNestedDoc{"nested"};
constexpr std::string_view kPropertySubject{"subject"};
96 
97 constexpr SectionId kSectionIdNestedBody = 1;
98 
99 class TermIndexingHandlerTest : public Test {
100  protected:
SetUp()101   void SetUp() override {
102     if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
103       ICING_ASSERT_OK(
104           // File generated via icu_data_file rule in //icing/BUILD.
105           icu_data_file_helper::SetUpICUDataFile(
106               GetTestFilePath("icing/icu.dat")));
107     }
108 
109     base_dir_ = GetTestTempDir() + "/icing_test";
110     ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
111                 IsTrue());
112 
113     index_dir_ = base_dir_ + "/index";
114     schema_store_dir_ = base_dir_ + "/schema_store";
115     document_store_dir_ = base_dir_ + "/document_store";
116 
117     language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
118     ICING_ASSERT_OK_AND_ASSIGN(
119         lang_segmenter_,
120         language_segmenter_factory::Create(std::move(segmenter_options)));
121 
122     ICING_ASSERT_OK_AND_ASSIGN(
123         normalizer_,
124         normalizer_factory::Create(
125             /*max_term_byte_size=*/std::numeric_limits<int32_t>::max()));
126 
127     ASSERT_THAT(
128         filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()),
129         IsTrue());
130     ICING_ASSERT_OK_AND_ASSIGN(
131         schema_store_,
132         SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
133     SchemaProto schema =
134         SchemaBuilder()
135             .AddType(
136                 SchemaTypeConfigBuilder()
137                     .SetType(kFakeType)
138                     .AddProperty(PropertyConfigBuilder()
139                                      .SetName(kPropertyTitle)
140                                      .SetDataTypeString(TERM_MATCH_PREFIX,
141                                                         TOKENIZER_PLAIN)
142                                      .SetCardinality(CARDINALITY_OPTIONAL))
143                     .AddProperty(PropertyConfigBuilder()
144                                      .SetName(kPropertyBody)
145                                      .SetDataTypeString(TERM_MATCH_EXACT,
146                                                         TOKENIZER_PLAIN)
147                                      .SetCardinality(CARDINALITY_OPTIONAL)))
148             .AddType(
149                 SchemaTypeConfigBuilder()
150                     .SetType(kNestedType)
151                     .AddProperty(
152                         PropertyConfigBuilder()
153                             .SetName(kPropertyNestedDoc)
154                             .SetDataTypeDocument(
155                                 kFakeType, /*index_nested_properties=*/true)
156                             .SetCardinality(CARDINALITY_OPTIONAL))
157                     .AddProperty(PropertyConfigBuilder()
158                                      .SetName(kPropertySubject)
159                                      .SetDataTypeString(TERM_MATCH_EXACT,
160                                                         TOKENIZER_PLAIN)
161                                      .SetCardinality(CARDINALITY_OPTIONAL))
162                     .AddProperty(PropertyConfigBuilder()
163                                      .SetName(kPropertyName)
164                                      .SetDataTypeString(TERM_MATCH_EXACT,
165                                                         TOKENIZER_PLAIN)
166                                      .SetCardinality(CARDINALITY_OPTIONAL)))
167             .Build();
168     ICING_ASSERT_OK(schema_store_->SetSchema(
169         schema, /*ignore_errors_and_delete_documents=*/false,
170         /*allow_circular_schema_definitions=*/false));
171 
172     ASSERT_TRUE(
173         filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str()));
174     ICING_ASSERT_OK_AND_ASSIGN(
175         DocumentStore::CreateResult doc_store_create_result,
176         DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
177                               schema_store_.get(),
178                               /*force_recovery_and_revalidate_documents=*/false,
179                               /*namespace_id_fingerprint=*/true,
180                               /*pre_mapping_fbv=*/false,
181                               /*use_persistent_hash_map=*/true,
182                               PortableFileBackedProtoLog<
183                                   DocumentWrapper>::kDeflateCompressionLevel,
184                               /*initialize_stats=*/nullptr));
185     document_store_ = std::move(doc_store_create_result.document_store);
186   }
187 
TearDown()188   void TearDown() override {
189     document_store_.reset();
190     schema_store_.reset();
191     normalizer_.reset();
192     lang_segmenter_.reset();
193 
194     filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
195   }
196 
197   Filesystem filesystem_;
198   IcingFilesystem icing_filesystem_;
199   FakeClock fake_clock_;
200   std::string base_dir_;
201   std::string index_dir_;
202   std::string schema_store_dir_;
203   std::string document_store_dir_;
204 
205   std::unique_ptr<LanguageSegmenter> lang_segmenter_;
206   std::unique_ptr<Normalizer> normalizer_;
207   std::unique_ptr<SchemaStore> schema_store_;
208   std::unique_ptr<DocumentStore> document_store_;
209 };
210 
211 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
QueryExistence(Index * index,std::string_view property_path)212 QueryExistence(Index* index, std::string_view property_path) {
213   return index->GetIterator(
214       absl_ports::StrCat(kPropertyExistenceTokenPrefix, property_path),
215       /*term_start_index=*/0,
216       /*unnormalized_term_length=*/0, kSectionIdMaskAll,
217       TermMatchType::EXACT_ONLY,
218       /*need_hit_term_frequency=*/false);
219 }
220 
GetHits(std::unique_ptr<DocHitInfoIterator> iterator)221 std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
222   std::vector<DocHitInfo> infos;
223   while (iterator->Advance().ok()) {
224     infos.push_back(iterator->doc_hit_info());
225   }
226   return infos;
227 }
228 
GetHitsWithTermFrequency(std::unique_ptr<DocHitInfoIterator> iterator)229 std::vector<DocHitInfoTermFrequencyPair> GetHitsWithTermFrequency(
230     std::unique_ptr<DocHitInfoIterator> iterator) {
231   std::vector<DocHitInfoTermFrequencyPair> infos;
232   while (iterator->Advance().ok()) {
233     std::vector<TermMatchInfo> matched_terms_stats;
234     iterator->PopulateMatchedTermsStats(&matched_terms_stats);
235     for (const TermMatchInfo& term_match_info : matched_terms_stats) {
236       infos.push_back(DocHitInfoTermFrequencyPair(
237           iterator->doc_hit_info(), term_match_info.term_frequencies));
238     }
239   }
240   return infos;
241 }
242 
TEST_F(TermIndexingHandlerTest,HandleBothStringSectionAndPropertyExistence)243 TEST_F(TermIndexingHandlerTest, HandleBothStringSectionAndPropertyExistence) {
244   Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
245                          /*lite_index_sort_at_indexing=*/true,
246                          /*lite_index_sort_size=*/1024 * 8);
247   ICING_ASSERT_OK_AND_ASSIGN(
248       std::unique_ptr<Index> index,
249       Index::Create(options, &filesystem_, &icing_filesystem_));
250 
251   DocumentProto document =
252       DocumentBuilder()
253           .SetKey("icing", "fake_type/1")
254           .SetSchema(std::string(kFakeType))
255           .AddStringProperty(std::string(kPropertyTitle), "foo")
256           .AddStringProperty(std::string(kPropertyBody), "")
257           .Build();
258 
259   ICING_ASSERT_OK_AND_ASSIGN(
260       TokenizedDocument tokenized_document,
261       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
262                                 std::move(document)));
263 
264   ICING_ASSERT_OK_AND_ASSIGN(
265       DocumentId document_id,
266       document_store_->Put(tokenized_document.document()));
267 
268   EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
269 
270   ICING_ASSERT_OK_AND_ASSIGN(
271       std::unique_ptr<TermIndexingHandler> handler,
272       TermIndexingHandler::Create(
273           &fake_clock_, normalizer_.get(), index.get(),
274           /*build_property_existence_metadata_hits=*/true));
275   EXPECT_THAT(
276       handler->Handle(tokenized_document, document_id, /*recovery_mode=*/false,
277                       /*put_document_stats=*/nullptr),
278       IsOk());
279 
280   EXPECT_THAT(index->last_added_document_id(), Eq(document_id));
281 
282   // Query 'foo'
283   ICING_ASSERT_OK_AND_ASSIGN(
284       std::unique_ptr<DocHitInfoIterator> itr,
285       index->GetIterator("foo", /*term_start_index=*/0,
286                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
287                          TermMatchType::EXACT_ONLY));
288   std::vector<DocHitInfoTermFrequencyPair> hits =
289       GetHitsWithTermFrequency(std::move(itr));
290   std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
291       {kSectionIdTitle, 1}};
292   EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
293                         document_id, expected_map)));
294 
295   // Query for "title" property existence.
296   ICING_ASSERT_OK_AND_ASSIGN(itr, QueryExistence(index.get(), kPropertyTitle));
297   EXPECT_THAT(
298       GetHits(std::move(itr)),
299       ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0})));
300 
301   // Query for "body" property existence.
302   ICING_ASSERT_OK_AND_ASSIGN(itr, QueryExistence(index.get(), kPropertyBody));
303   EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
304 }
305 
TEST_F(TermIndexingHandlerTest,HandleIntoLiteIndex_sortInIndexingNotTriggered)306 TEST_F(TermIndexingHandlerTest,
307        HandleIntoLiteIndex_sortInIndexingNotTriggered) {
308   Index::Options options(index_dir_, /*index_merge_size=*/1024 * 1024,
309                          /*lite_index_sort_at_indexing=*/true,
310                          /*lite_index_sort_size=*/1024 * 8);
311   ICING_ASSERT_OK_AND_ASSIGN(
312       std::unique_ptr<Index> index,
313       Index::Create(options, &filesystem_, &icing_filesystem_));
314 
315   DocumentProto document =
316       DocumentBuilder()
317           .SetKey("icing", "fake_type/1")
318           .SetSchema(std::string(kFakeType))
319           .AddStringProperty(std::string(kPropertyTitle), "foo")
320           .AddStringProperty(std::string(kPropertyBody), "foo bar baz")
321           .Build();
322 
323   ICING_ASSERT_OK_AND_ASSIGN(
324       TokenizedDocument tokenized_document,
325       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
326                                 std::move(document)));
327 
328   ICING_ASSERT_OK_AND_ASSIGN(
329       DocumentId document_id,
330       document_store_->Put(tokenized_document.document()));
331 
332   EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
333 
334   ICING_ASSERT_OK_AND_ASSIGN(
335       std::unique_ptr<TermIndexingHandler> handler,
336       TermIndexingHandler::Create(
337           &fake_clock_, normalizer_.get(), index.get(),
338           /*build_property_existence_metadata_hits=*/true));
339   EXPECT_THAT(
340       handler->Handle(tokenized_document, document_id, /*recovery_mode=*/false,
341                       /*put_document_stats=*/nullptr),
342       IsOk());
343 
344   EXPECT_THAT(index->last_added_document_id(), Eq(document_id));
345 
346   // Query 'foo'
347   ICING_ASSERT_OK_AND_ASSIGN(
348       std::unique_ptr<DocHitInfoIterator> itr,
349       index->GetIterator("foo", /*term_start_index=*/0,
350                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
351                          TermMatchType::EXACT_ONLY));
352   std::vector<DocHitInfoTermFrequencyPair> hits =
353       GetHitsWithTermFrequency(std::move(itr));
354   std::unordered_map<SectionId, Hit::TermFrequency> expected_map{
355       {kSectionIdTitle, 1}, {kSectionIdBody, 1}};
356   EXPECT_THAT(hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
357                         document_id, expected_map)));
358 
359   // Query 'foo' with sectionId mask that masks all results
360   ICING_ASSERT_OK_AND_ASSIGN(
361       itr, index->GetIterator("foo", /*term_start_index=*/0,
362                               /*unnormalized_term_length=*/0, 1U << 2,
363                               TermMatchType::EXACT_ONLY));
364   EXPECT_THAT(GetHits(std::move(itr)), IsEmpty());
365 }
366 
TEST_F(TermIndexingHandlerTest,HandleIntoLiteIndex_sortInIndexingTriggered)367 TEST_F(TermIndexingHandlerTest, HandleIntoLiteIndex_sortInIndexingTriggered) {
368   // Create the LiteIndex with a smaller sort threshold. At 64 bytes we sort the
369   // HitBuffer after inserting 8 hits
370   Index::Options options(index_dir_,
371                          /*index_merge_size=*/1024 * 1024,
372                          /*lite_index_sort_at_indexing=*/true,
373                          /*lite_index_sort_size=*/64);
374   ICING_ASSERT_OK_AND_ASSIGN(
375       std::unique_ptr<Index> index,
376       Index::Create(options, &filesystem_, &icing_filesystem_));
377 
378   DocumentProto document0 =
379       DocumentBuilder()
380           .SetKey("icing", "fake_type/0")
381           .SetSchema(std::string(kFakeType))
382           .AddStringProperty(std::string(kPropertyTitle), "foo foo foo")
383           .AddStringProperty(std::string(kPropertyBody), "foo bar baz")
384           .Build();
385   DocumentProto document1 =
386       DocumentBuilder()
387           .SetKey("icing", "fake_type/1")
388           .SetSchema(std::string(kFakeType))
389           .AddStringProperty(std::string(kPropertyTitle), "bar baz baz")
390           .AddStringProperty(std::string(kPropertyBody), "foo foo baz")
391           .Build();
392   DocumentProto document2 =
393       DocumentBuilder()
394           .SetKey("icing", "nested_type/0")
395           .SetSchema(std::string(kNestedType))
396           .AddDocumentProperty(std::string(kPropertyNestedDoc), document1)
397           .AddStringProperty(std::string(kPropertyName), "qux")
398           .AddStringProperty(std::string(kPropertySubject), "bar bar")
399           .Build();
400 
401   ICING_ASSERT_OK_AND_ASSIGN(
402       TokenizedDocument tokenized_document0,
403       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
404                                 std::move(document0)));
405   ICING_ASSERT_OK_AND_ASSIGN(
406       DocumentId document_id0,
407       document_store_->Put(tokenized_document0.document()));
408 
409   ICING_ASSERT_OK_AND_ASSIGN(
410       TokenizedDocument tokenized_document1,
411       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
412                                 std::move(document1)));
413   ICING_ASSERT_OK_AND_ASSIGN(
414       DocumentId document_id1,
415       document_store_->Put(tokenized_document1.document()));
416 
417   ICING_ASSERT_OK_AND_ASSIGN(
418       TokenizedDocument tokenized_document2,
419       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
420                                 std::move(document2)));
421   ICING_ASSERT_OK_AND_ASSIGN(
422       DocumentId document_id2,
423       document_store_->Put(tokenized_document2.document()));
424   EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
425 
426   ICING_ASSERT_OK_AND_ASSIGN(
427       std::unique_ptr<TermIndexingHandler> handler,
428       TermIndexingHandler::Create(
429           &fake_clock_, normalizer_.get(), index.get(),
430           /*build_property_existence_metadata_hits=*/true));
431 
432   // Handle doc0 and doc1. The LiteIndex should sort and merge after adding
433   // these
434   EXPECT_THAT(handler->Handle(tokenized_document0, document_id0,
435                               /*recovery_mode=*/false,
436                               /*put_document_stats=*/nullptr),
437               IsOk());
438   EXPECT_THAT(handler->Handle(tokenized_document1, document_id1,
439                               /*recovery_mode=*/false,
440                               /*put_document_stats=*/nullptr),
441               IsOk());
442   EXPECT_THAT(index->last_added_document_id(), Eq(document_id1));
443   EXPECT_THAT(index->LiteIndexNeedSort(), IsFalse());
444 
445   // Handle doc2. The LiteIndex should have an unsorted portion after adding
446   EXPECT_THAT(handler->Handle(tokenized_document2, document_id2,
447                               /*recovery_mode=*/false,
448                               /*put_document_stats=*/nullptr),
449               IsOk());
450   EXPECT_THAT(index->last_added_document_id(), Eq(document_id2));
451 
452   // Hits in the hit buffer:
453   // <term>: {(docId, sectionId, term_freq)...}
454   // foo: {(0, kSectionIdTitle, 3); (0, kSectionIdBody, 1);
455   //       (1, kSectionIdBody, 2);
456   //       (2, kSectionIdNestedBody, 2)}
457   // bar: {(0, kSectionIdBody, 1);
458   //       (1, kSectionIdTitle, 1);
459   //       (2, kSectionIdNestedTitle, 1); (2, kSectionIdSubject, 2)}
460   // baz: {(0, kSectionIdBody, 1);
461   //       (1, kSectionIdTitle, 2); (1, kSectionIdBody, 1),
462   //       (2, kSectionIdNestedTitle, 2); (2, kSectionIdNestedBody, 1)}
463   // qux: {(2, kSectionIdName, 1)}
464 
465   // Query 'foo'
466   ICING_ASSERT_OK_AND_ASSIGN(
467       std::unique_ptr<DocHitInfoIterator> itr,
468       index->GetIterator("foo", /*term_start_index=*/0,
469                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
470                          TermMatchType::EXACT_ONLY));
471 
472   // Advance the iterator and verify that we're returning hits in the correct
473   // order (i.e. in descending order of DocId)
474   ASSERT_THAT(itr->Advance(), IsOk());
475   EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(2));
476   EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(),
477               Eq(1U << kSectionIdNestedBody));
478   std::vector<TermMatchInfo> matched_terms_stats;
479   std::unordered_map<SectionId, Hit::TermFrequency>
480       expected_section_ids_tf_map2 = {{kSectionIdNestedBody, 2}};
481   itr->PopulateMatchedTermsStats(&matched_terms_stats);
482   EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
483                                        "foo", expected_section_ids_tf_map2)));
484 
485   ASSERT_THAT(itr->Advance(), IsOk());
486   EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(1));
487   EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(),
488               Eq(1U << kSectionIdBody));
489   std::unordered_map<SectionId, Hit::TermFrequency>
490       expected_section_ids_tf_map1 = {{kSectionIdBody, 2}};
491   matched_terms_stats.clear();
492   itr->PopulateMatchedTermsStats(&matched_terms_stats);
493   EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
494                                        "foo", expected_section_ids_tf_map1)));
495 
496   ASSERT_THAT(itr->Advance(), IsOk());
497   EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(0));
498   EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(),
499               Eq(1U << kSectionIdTitle | 1U << kSectionIdBody));
500   std::unordered_map<SectionId, Hit::TermFrequency>
501       expected_section_ids_tf_map0 = {{kSectionIdTitle, 3},
502                                       {kSectionIdBody, 1}};
503   matched_terms_stats.clear();
504   itr->PopulateMatchedTermsStats(&matched_terms_stats);
505   EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
506                                        "foo", expected_section_ids_tf_map0)));
507 }
508 
TEST_F(TermIndexingHandlerTest,HandleIntoLiteIndex_enableSortInIndexing)509 TEST_F(TermIndexingHandlerTest, HandleIntoLiteIndex_enableSortInIndexing) {
510   // Create the LiteIndex with a smaller sort threshold. At 64 bytes we sort the
511   // HitBuffer after inserting 8 hits
512   Index::Options options(index_dir_,
513                          /*index_merge_size=*/1024 * 1024,
514                          /*lite_index_sort_at_indexing=*/false,
515                          /*lite_index_sort_size=*/64);
516   ICING_ASSERT_OK_AND_ASSIGN(
517       std::unique_ptr<Index> index,
518       Index::Create(options, &filesystem_, &icing_filesystem_));
519 
520   DocumentProto document0 =
521       DocumentBuilder()
522           .SetKey("icing", "fake_type/0")
523           .SetSchema(std::string(kFakeType))
524           .AddStringProperty(std::string(kPropertyTitle), "foo foo foo")
525           .AddStringProperty(std::string(kPropertyBody), "foo bar baz")
526           .Build();
527   DocumentProto document1 =
528       DocumentBuilder()
529           .SetKey("icing", "fake_type/1")
530           .SetSchema(std::string(kFakeType))
531           .AddStringProperty(std::string(kPropertyTitle), "bar baz baz")
532           .AddStringProperty(std::string(kPropertyBody), "foo foo baz")
533           .Build();
534   DocumentProto document2 =
535       DocumentBuilder()
536           .SetKey("icing", "nested_type/0")
537           .SetSchema(std::string(kNestedType))
538           .AddDocumentProperty(std::string(kPropertyNestedDoc), document1)
539           .AddStringProperty(std::string(kPropertyName), "qux")
540           .AddStringProperty(std::string(kPropertySubject), "bar bar")
541           .Build();
542 
543   ICING_ASSERT_OK_AND_ASSIGN(
544       TokenizedDocument tokenized_document0,
545       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
546                                 std::move(document0)));
547   ICING_ASSERT_OK_AND_ASSIGN(
548       DocumentId document_id0,
549       document_store_->Put(tokenized_document0.document()));
550 
551   ICING_ASSERT_OK_AND_ASSIGN(
552       TokenizedDocument tokenized_document1,
553       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
554                                 std::move(document1)));
555   ICING_ASSERT_OK_AND_ASSIGN(
556       DocumentId document_id1,
557       document_store_->Put(tokenized_document1.document()));
558 
559   ICING_ASSERT_OK_AND_ASSIGN(
560       TokenizedDocument tokenized_document2,
561       TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
562                                 std::move(document2)));
563   ICING_ASSERT_OK_AND_ASSIGN(
564       DocumentId document_id2,
565       document_store_->Put(tokenized_document2.document()));
566   EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));
567 
568   ICING_ASSERT_OK_AND_ASSIGN(
569       std::unique_ptr<TermIndexingHandler> handler,
570       TermIndexingHandler::Create(
571           &fake_clock_, normalizer_.get(), index.get(),
572           /*build_property_existence_metadata_hits=*/true));
573 
574   // Handle all docs
575   EXPECT_THAT(handler->Handle(tokenized_document0, document_id0,
576                               /*recovery_mode=*/false,
577                               /*put_document_stats=*/nullptr),
578               IsOk());
579   EXPECT_THAT(handler->Handle(tokenized_document1, document_id1,
580                               /*recovery_mode=*/false,
581                               /*put_document_stats=*/nullptr),
582               IsOk());
583   EXPECT_THAT(handler->Handle(tokenized_document2, document_id2,
584                               /*recovery_mode=*/false,
585                               /*put_document_stats=*/nullptr),
586               IsOk());
587   EXPECT_THAT(index->last_added_document_id(), Eq(document_id2));
588 
589   // We've disabled sorting during indexing so the HitBuffer's unsorted section
590   // should exceed the sort threshold. PersistToDisk and reinitialize the
591   // LiteIndex with sort_at_indexing=true.
592   ASSERT_THAT(index->PersistToDisk(), IsOk());
593   options = Index::Options(index_dir_,
594                            /*index_merge_size=*/1024 * 1024,
595                            /*lite_index_sort_at_indexing=*/true,
596                            /*lite_index_sort_size=*/64);
597   ICING_ASSERT_OK_AND_ASSIGN(
598       index, Index::Create(options, &filesystem_, &icing_filesystem_));
599 
600   // Verify that the HitBuffer has been sorted after initializing with
601   // sort_at_indexing enabled.
602   EXPECT_THAT(index->LiteIndexNeedSort(), IsFalse());
603 
604   // Hits in the hit buffer:
605   // <term>: {(docId, sectionId, term_freq)...}
606   // foo: {(0, kSectionIdTitle, 3); (0, kSectionIdBody, 1);
607   //       (1, kSectionIdBody, 2);
608   //       (2, kSectionIdNestedBody, 2)}
609   // bar: {(0, kSectionIdBody, 1);
610   //       (1, kSectionIdTitle, 1);
611   //       (2, kSectionIdNestedTitle, 1); (2, kSectionIdSubject, 2)}
612   // baz: {(0, kSectionIdBody, 1);
613   //       (1, kSectionIdTitle, 2); (1, kSectionIdBody, 1),
614   //       (2, kSectionIdNestedTitle, 2); (2, kSectionIdNestedBody, 1)}
615   // qux: {(2, kSectionIdName, 1)}
616 
617   // Query 'foo'
618   ICING_ASSERT_OK_AND_ASSIGN(
619       std::unique_ptr<DocHitInfoIterator> itr,
620       index->GetIterator("foo", /*term_start_index=*/0,
621                          /*unnormalized_term_length=*/0, kSectionIdMaskAll,
622                          TermMatchType::EXACT_ONLY));
623 
624   // Advance the iterator and verify that we're returning hits in the correct
625   // order (i.e. in descending order of DocId)
626   ASSERT_THAT(itr->Advance(), IsOk());
627   EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(2));
628   EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(),
629               Eq(1U << kSectionIdNestedBody));
630   std::vector<TermMatchInfo> matched_terms_stats;
631   std::unordered_map<SectionId, Hit::TermFrequency>
632       expected_section_ids_tf_map2 = {{kSectionIdNestedBody, 2}};
633   itr->PopulateMatchedTermsStats(&matched_terms_stats);
634   EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
635                                        "foo", expected_section_ids_tf_map2)));
636 
637   ASSERT_THAT(itr->Advance(), IsOk());
638   EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(1));
639   EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(),
640               Eq(1U << kSectionIdBody));
641   std::unordered_map<SectionId, Hit::TermFrequency>
642       expected_section_ids_tf_map1 = {{kSectionIdBody, 2}};
643   matched_terms_stats.clear();
644   itr->PopulateMatchedTermsStats(&matched_terms_stats);
645   EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
646                                        "foo", expected_section_ids_tf_map1)));
647 
648   ASSERT_THAT(itr->Advance(), IsOk());
649   EXPECT_THAT(itr->doc_hit_info().document_id(), Eq(0));
650   EXPECT_THAT(itr->doc_hit_info().hit_section_ids_mask(),
651               Eq(1U << kSectionIdTitle | 1U << kSectionIdBody));
652   std::unordered_map<SectionId, Hit::TermFrequency>
653       expected_section_ids_tf_map0 = {{kSectionIdTitle, 3},
654                                       {kSectionIdBody, 1}};
655   matched_terms_stats.clear();
656   itr->PopulateMatchedTermsStats(&matched_terms_stats);
657   EXPECT_THAT(matched_terms_stats, ElementsAre(EqualsTermMatchInfo(
658                                        "foo", expected_section_ids_tf_map0)));
659 }
660 
661 }  // namespace
662 
663 }  // namespace lib
664 }  // namespace icing
665