1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/index/term-indexing-handler.h"
16
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 #include <utility>
24 #include <vector>
25
26 #include "icing/text_classifier/lib3/utils/base/status.h"
27 #include "icing/text_classifier/lib3/utils/base/statusor.h"
28 #include "gmock/gmock.h"
29 #include "gtest/gtest.h"
30 #include "icing/absl_ports/str_cat.h"
31 #include "icing/document-builder.h"
32 #include "icing/file/filesystem.h"
33 #include "icing/file/portable-file-backed-proto-log.h"
34 #include "icing/index/hit/doc-hit-info.h"
35 #include "icing/index/hit/hit.h"
36 #include "icing/index/index.h"
37 #include "icing/index/iterator/doc-hit-info-iterator-test-util.h"
38 #include "icing/index/iterator/doc-hit-info-iterator.h"
39 #include "icing/index/property-existence-indexing-handler.h"
40 #include "icing/legacy/index/icing-filesystem.h"
41 #include "icing/portable/platform.h"
42 #include "icing/proto/document.pb.h"
43 #include "icing/proto/document_wrapper.pb.h"
44 #include "icing/proto/schema.pb.h"
45 #include "icing/proto/term.pb.h"
46 #include "icing/schema-builder.h"
47 #include "icing/schema/schema-store.h"
48 #include "icing/schema/section.h"
49 #include "icing/store/document-id.h"
50 #include "icing/store/document-store.h"
51 #include "icing/testing/common-matchers.h"
52 #include "icing/testing/fake-clock.h"
53 #include "icing/testing/icu-data-file-helper.h"
54 #include "icing/testing/test-data.h"
55 #include "icing/testing/tmp-directory.h"
56 #include "icing/tokenization/language-segmenter-factory.h"
57 #include "icing/tokenization/language-segmenter.h"
58 #include "icing/transform/normalizer-factory.h"
59 #include "icing/transform/normalizer.h"
60 #include "icing/util/tokenized-document.h"
61 #include "unicode/uloc.h"
62
63 namespace icing {
64 namespace lib {
65
66 namespace {
67
68 using ::testing::ElementsAre;
69 using ::testing::Eq;
70 using ::testing::IsEmpty;
71 using ::testing::IsFalse;
72 using ::testing::IsTrue;
73 using ::testing::Test;
74
75 // Schema type with indexable properties and section Id.
76 // Section Id is determined by the lexicographical order of indexable property
77 // path.
78 // Section id = 0: body
79 // Section id = 1: title
80 constexpr std::string_view kFakeType = "FakeType";
81 constexpr std::string_view kPropertyBody = "body";
82 constexpr std::string_view kPropertyTitle = "title";
83
84 constexpr SectionId kSectionIdBody = 0;
85 constexpr SectionId kSectionIdTitle = 1;
86
87 // Schema type with nested indexable properties and section Id.
88 // Section id = 0: "name"
89 // Section id = 1: "nested.body"
90 // Section id = 2: "nested.title"
91 // Section id = 3: "subject"
92 constexpr std::string_view kNestedType = "NestedType";
93 constexpr std::string_view kPropertyName = "name";
94 constexpr std::string_view kPropertyNestedDoc = "nested";
95 constexpr std::string_view kPropertySubject = "subject";
96
97 constexpr SectionId kSectionIdNestedBody = 1;
98
99 class TermIndexingHandlerTest : public Test {
100 protected:
SetUp()101 void SetUp() override {
102 if (!IsCfStringTokenization() && !IsReverseJniTokenization()) {
103 ICING_ASSERT_OK(
104 // File generated via icu_data_file rule in //icing/BUILD.
105 icu_data_file_helper::SetUpICUDataFile(
106 GetTestFilePath("icing/icu.dat")));
107 }
108
109 base_dir_ = GetTestTempDir() + "/icing_test";
110 ASSERT_THAT(filesystem_.CreateDirectoryRecursively(base_dir_.c_str()),
111 IsTrue());
112
113 index_dir_ = base_dir_ + "/index";
114 schema_store_dir_ = base_dir_ + "/schema_store";
115 document_store_dir_ = base_dir_ + "/document_store";
116
117 language_segmenter_factory::SegmenterOptions segmenter_options(ULOC_US);
118 ICING_ASSERT_OK_AND_ASSIGN(
119 lang_segmenter_,
120 language_segmenter_factory::Create(std::move(segmenter_options)));
121
122 ICING_ASSERT_OK_AND_ASSIGN(
123 normalizer_,
124 normalizer_factory::Create(
125 /*max_term_byte_size=*/std::numeric_limits<int32_t>::max()));
126
127 ASSERT_THAT(
128 filesystem_.CreateDirectoryRecursively(schema_store_dir_.c_str()),
129 IsTrue());
130 ICING_ASSERT_OK_AND_ASSIGN(
131 schema_store_,
132 SchemaStore::Create(&filesystem_, schema_store_dir_, &fake_clock_));
133 SchemaProto schema =
134 SchemaBuilder()
135 .AddType(
136 SchemaTypeConfigBuilder()
137 .SetType(kFakeType)
138 .AddProperty(PropertyConfigBuilder()
139 .SetName(kPropertyTitle)
140 .SetDataTypeString(TERM_MATCH_PREFIX,
141 TOKENIZER_PLAIN)
142 .SetCardinality(CARDINALITY_OPTIONAL))
143 .AddProperty(PropertyConfigBuilder()
144 .SetName(kPropertyBody)
145 .SetDataTypeString(TERM_MATCH_EXACT,
146 TOKENIZER_PLAIN)
147 .SetCardinality(CARDINALITY_OPTIONAL)))
148 .AddType(
149 SchemaTypeConfigBuilder()
150 .SetType(kNestedType)
151 .AddProperty(
152 PropertyConfigBuilder()
153 .SetName(kPropertyNestedDoc)
154 .SetDataTypeDocument(
155 kFakeType, /*index_nested_properties=*/true)
156 .SetCardinality(CARDINALITY_OPTIONAL))
157 .AddProperty(PropertyConfigBuilder()
158 .SetName(kPropertySubject)
159 .SetDataTypeString(TERM_MATCH_EXACT,
160 TOKENIZER_PLAIN)
161 .SetCardinality(CARDINALITY_OPTIONAL))
162 .AddProperty(PropertyConfigBuilder()
163 .SetName(kPropertyName)
164 .SetDataTypeString(TERM_MATCH_EXACT,
165 TOKENIZER_PLAIN)
166 .SetCardinality(CARDINALITY_OPTIONAL)))
167 .Build();
168 ICING_ASSERT_OK(schema_store_->SetSchema(
169 schema, /*ignore_errors_and_delete_documents=*/false,
170 /*allow_circular_schema_definitions=*/false));
171
172 ASSERT_TRUE(
173 filesystem_.CreateDirectoryRecursively(document_store_dir_.c_str()));
174 ICING_ASSERT_OK_AND_ASSIGN(
175 DocumentStore::CreateResult doc_store_create_result,
176 DocumentStore::Create(&filesystem_, document_store_dir_, &fake_clock_,
177 schema_store_.get(),
178 /*force_recovery_and_revalidate_documents=*/false,
179 /*namespace_id_fingerprint=*/true,
180 /*pre_mapping_fbv=*/false,
181 /*use_persistent_hash_map=*/true,
182 PortableFileBackedProtoLog<
183 DocumentWrapper>::kDeflateCompressionLevel,
184 /*initialize_stats=*/nullptr));
185 document_store_ = std::move(doc_store_create_result.document_store);
186 }
187
TearDown()188 void TearDown() override {
189 document_store_.reset();
190 schema_store_.reset();
191 normalizer_.reset();
192 lang_segmenter_.reset();
193
194 filesystem_.DeleteDirectoryRecursively(base_dir_.c_str());
195 }
196
197 Filesystem filesystem_;
198 IcingFilesystem icing_filesystem_;
199 FakeClock fake_clock_;
200 std::string base_dir_;
201 std::string index_dir_;
202 std::string schema_store_dir_;
203 std::string document_store_dir_;
204
205 std::unique_ptr<LanguageSegmenter> lang_segmenter_;
206 std::unique_ptr<Normalizer> normalizer_;
207 std::unique_ptr<SchemaStore> schema_store_;
208 std::unique_ptr<DocumentStore> document_store_;
209 };
210
211 libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>>
QueryExistence(Index * index,std::string_view property_path)212 QueryExistence(Index* index, std::string_view property_path) {
213 return index->GetIterator(
214 absl_ports::StrCat(kPropertyExistenceTokenPrefix, property_path),
215 /*term_start_index=*/0,
216 /*unnormalized_term_length=*/0, kSectionIdMaskAll,
217 TermMatchType::EXACT_ONLY,
218 /*need_hit_term_frequency=*/false);
219 }
220
GetHits(std::unique_ptr<DocHitInfoIterator> iterator)221 std::vector<DocHitInfo> GetHits(std::unique_ptr<DocHitInfoIterator> iterator) {
222 std::vector<DocHitInfo> infos;
223 while (iterator->Advance().ok()) {
224 infos.push_back(iterator->doc_hit_info());
225 }
226 return infos;
227 }
228
GetHitsWithTermFrequency(std::unique_ptr<DocHitInfoIterator> iterator)229 std::vector<DocHitInfoTermFrequencyPair> GetHitsWithTermFrequency(
230 std::unique_ptr<DocHitInfoIterator> iterator) {
231 std::vector<DocHitInfoTermFrequencyPair> infos;
232 while (iterator->Advance().ok()) {
233 std::vector<TermMatchInfo> matched_terms_stats;
234 iterator->PopulateMatchedTermsStats(&matched_terms_stats);
235 for (const TermMatchInfo& term_match_info : matched_terms_stats) {
236 infos.push_back(DocHitInfoTermFrequencyPair(
237 iterator->doc_hit_info(), term_match_info.term_frequencies));
238 }
239 }
240 return infos;
241 }
242
// Indexes a single kFakeType document with property existence metadata hits
// enabled, then checks that:
//   - the term "foo" is found in the title section with term frequency 1,
//   - the "title" property has an existence hit, and
//   - the "body" property, which is set to an empty string, has none.
TEST_F(TermIndexingHandlerTest, HandleBothStringSectionAndPropertyExistence) {
  Index::Options index_options(index_dir_, /*index_merge_size=*/1024 * 1024,
                               /*lite_index_sort_at_indexing=*/true,
                               /*lite_index_sort_size=*/1024 * 8);
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Index> index,
      Index::Create(index_options, &filesystem_, &icing_filesystem_));

  // "title" carries a real term while "body" is present but empty.
  DocumentProto fake_document =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPropertyTitle), "foo")
          .AddStringProperty(std::string(kPropertyBody), "")
          .Build();

  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                std::move(fake_document)));

  ICING_ASSERT_OK_AND_ASSIGN(
      DocumentId document_id,
      document_store_->Put(tokenized_document.document()));

  // Nothing has been added to the index yet.
  EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<TermIndexingHandler> handler,
      TermIndexingHandler::Create(
          &fake_clock_, normalizer_.get(), index.get(),
          /*build_property_existence_metadata_hits=*/true));
  EXPECT_THAT(
      handler->Handle(tokenized_document, document_id, /*recovery_mode=*/false,
                      /*put_document_stats=*/nullptr),
      IsOk());

  EXPECT_THAT(index->last_added_document_id(), Eq(document_id));

  // Term query: 'foo' should only match the title section.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> foo_itr,
      index->GetIterator("foo", /*term_start_index=*/0,
                         /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                         TermMatchType::EXACT_ONLY));
  std::vector<DocHitInfoTermFrequencyPair> foo_hits =
      GetHitsWithTermFrequency(std::move(foo_itr));
  std::unordered_map<SectionId, Hit::TermFrequency> expected_foo_tf{
      {kSectionIdTitle, 1}};
  EXPECT_THAT(foo_hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                            document_id, expected_foo_tf)));

  // Existence query for "title": the document is expected, reported under
  // section id 0.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> title_existence_itr,
      QueryExistence(index.get(), kPropertyTitle));
  EXPECT_THAT(
      GetHits(std::move(title_existence_itr)),
      ElementsAre(EqualsDocHitInfo(document_id, std::vector<SectionId>{0})));

  // Existence query for "body": no hit is expected for the empty value.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> body_existence_itr,
      QueryExistence(index.get(), kPropertyBody));
  EXPECT_THAT(GetHits(std::move(body_existence_itr)), IsEmpty());
}
305
// Indexes a single kFakeType document using a large lite-index sort size
// (8 KiB), then checks that 'foo' is found in both the title and body sections
// and that a section mask excluding both sections returns nothing.
TEST_F(TermIndexingHandlerTest,
       HandleIntoLiteIndex_sortInIndexingNotTriggered) {
  Index::Options index_options(index_dir_, /*index_merge_size=*/1024 * 1024,
                               /*lite_index_sort_at_indexing=*/true,
                               /*lite_index_sort_size=*/1024 * 8);
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Index> index,
      Index::Create(index_options, &filesystem_, &icing_filesystem_));

  DocumentProto fake_document =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPropertyTitle), "foo")
          .AddStringProperty(std::string(kPropertyBody), "foo bar baz")
          .Build();

  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_document,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                std::move(fake_document)));

  ICING_ASSERT_OK_AND_ASSIGN(
      DocumentId document_id,
      document_store_->Put(tokenized_document.document()));

  // Nothing has been added to the index yet.
  EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<TermIndexingHandler> handler,
      TermIndexingHandler::Create(
          &fake_clock_, normalizer_.get(), index.get(),
          /*build_property_existence_metadata_hits=*/true));
  EXPECT_THAT(
      handler->Handle(tokenized_document, document_id, /*recovery_mode=*/false,
                      /*put_document_stats=*/nullptr),
      IsOk());

  EXPECT_THAT(index->last_added_document_id(), Eq(document_id));

  // 'foo' appears once in the title and once in the body.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> all_sections_itr,
      index->GetIterator("foo", /*term_start_index=*/0,
                         /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                         TermMatchType::EXACT_ONLY));
  std::vector<DocHitInfoTermFrequencyPair> foo_hits =
      GetHitsWithTermFrequency(std::move(all_sections_itr));
  std::unordered_map<SectionId, Hit::TermFrequency> expected_foo_tf{
      {kSectionIdTitle, 1}, {kSectionIdBody, 1}};
  EXPECT_THAT(foo_hits, ElementsAre(EqualsDocHitInfoWithTermFrequency(
                            document_id, expected_foo_tf)));

  // Restrict the query to section id 2 only. The document's hits are in
  // sections 0 (body) and 1 (title), so this mask filters every result out.
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> masked_itr,
      index->GetIterator("foo", /*term_start_index=*/0,
                         /*unnormalized_term_length=*/0, 1U << 2,
                         TermMatchType::EXACT_ONLY));
  EXPECT_THAT(GetHits(std::move(masked_itr)), IsEmpty());
}
366
// Indexes three documents with sort-at-indexing enabled and a small sort
// threshold, checks that the lite index reports no pending sort after the
// first two documents, and then verifies that a query for 'foo' returns the
// three documents in descending document id order with the expected
// per-section term frequencies.
TEST_F(TermIndexingHandlerTest, HandleIntoLiteIndex_sortInIndexingTriggered) {
  // Create the LiteIndex with a smaller sort threshold. At 64 bytes we sort the
  // HitBuffer after inserting 8 hits
  Index::Options index_options(index_dir_,
                               /*index_merge_size=*/1024 * 1024,
                               /*lite_index_sort_at_indexing=*/true,
                               /*lite_index_sort_size=*/64);
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Index> index,
      Index::Create(index_options, &filesystem_, &icing_filesystem_));

  DocumentProto fake_doc0 =
      DocumentBuilder()
          .SetKey("icing", "fake_type/0")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPropertyTitle), "foo foo foo")
          .AddStringProperty(std::string(kPropertyBody), "foo bar baz")
          .Build();
  DocumentProto fake_doc1 =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPropertyTitle), "bar baz baz")
          .AddStringProperty(std::string(kPropertyBody), "foo foo baz")
          .Build();
  // The nested document embeds a copy of fake_doc1, so it must be built before
  // fake_doc1 is moved from below.
  DocumentProto nested_doc =
      DocumentBuilder()
          .SetKey("icing", "nested_type/0")
          .SetSchema(std::string(kNestedType))
          .AddDocumentProperty(std::string(kPropertyNestedDoc), fake_doc1)
          .AddStringProperty(std::string(kPropertyName), "qux")
          .AddStringProperty(std::string(kPropertySubject), "bar bar")
          .Build();

  // Tokenize each document and put it into the document store.
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc0,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                std::move(fake_doc0)));
  ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id0,
                             document_store_->Put(tokenized_doc0.document()));

  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc1,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                std::move(fake_doc1)));
  ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id1,
                             document_store_->Put(tokenized_doc1.document()));

  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc2,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                std::move(nested_doc)));
  ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id2,
                             document_store_->Put(tokenized_doc2.document()));
  EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<TermIndexingHandler> handler,
      TermIndexingHandler::Create(
          &fake_clock_, normalizer_.get(), index.get(),
          /*build_property_existence_metadata_hits=*/true));

  // Handle doc0 and doc1. The LiteIndex should sort and merge after adding
  // these
  EXPECT_THAT(handler->Handle(tokenized_doc0, doc_id0,
                              /*recovery_mode=*/false,
                              /*put_document_stats=*/nullptr),
              IsOk());
  EXPECT_THAT(handler->Handle(tokenized_doc1, doc_id1,
                              /*recovery_mode=*/false,
                              /*put_document_stats=*/nullptr),
              IsOk());
  EXPECT_THAT(index->last_added_document_id(), Eq(doc_id1));
  EXPECT_THAT(index->LiteIndexNeedSort(), IsFalse());

  // Handle doc2. The LiteIndex should have an unsorted portion after adding
  EXPECT_THAT(handler->Handle(tokenized_doc2, doc_id2,
                              /*recovery_mode=*/false,
                              /*put_document_stats=*/nullptr),
              IsOk());
  EXPECT_THAT(index->last_added_document_id(), Eq(doc_id2));

  // Hits in the hit buffer:
  // <term>: {(docId, sectionId, term_freq)...}
  // foo: {(0, kSectionIdTitle, 3); (0, kSectionIdBody, 1);
  //       (1, kSectionIdBody, 2);
  //       (2, kSectionIdNestedBody, 2)}
  // bar: {(0, kSectionIdBody, 1);
  //       (1, kSectionIdTitle, 1);
  //       (2, kSectionIdNestedTitle, 1); (2, kSectionIdSubject, 2)}
  // baz: {(0, kSectionIdBody, 1);
  //       (1, kSectionIdTitle, 2); (1, kSectionIdBody, 1),
  //       (2, kSectionIdNestedTitle, 2); (2, kSectionIdNestedBody, 1)}
  // qux: {(2, kSectionIdName, 1)}

  // Expected per-section term frequencies of 'foo' for documents 2, 1 and 0.
  std::unordered_map<SectionId, Hit::TermFrequency> expected_tf_doc2 = {
      {kSectionIdNestedBody, 2}};
  std::unordered_map<SectionId, Hit::TermFrequency> expected_tf_doc1 = {
      {kSectionIdBody, 2}};
  std::unordered_map<SectionId, Hit::TermFrequency> expected_tf_doc0 = {
      {kSectionIdTitle, 3}, {kSectionIdBody, 1}};

  // Query 'foo'
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> foo_itr,
      index->GetIterator("foo", /*term_start_index=*/0,
                         /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                         TermMatchType::EXACT_ONLY));

  // Advance the iterator and verify that we're returning hits in the correct
  // order (i.e. in descending order of DocId)

  // First hit: document 2, nested body section only.
  ASSERT_THAT(foo_itr->Advance(), IsOk());
  EXPECT_THAT(foo_itr->doc_hit_info().document_id(), Eq(2));
  EXPECT_THAT(foo_itr->doc_hit_info().hit_section_ids_mask(),
              Eq(1U << kSectionIdNestedBody));
  std::vector<TermMatchInfo> stats_doc2;
  foo_itr->PopulateMatchedTermsStats(&stats_doc2);
  EXPECT_THAT(stats_doc2,
              ElementsAre(EqualsTermMatchInfo("foo", expected_tf_doc2)));

  // Second hit: document 1, body section only.
  ASSERT_THAT(foo_itr->Advance(), IsOk());
  EXPECT_THAT(foo_itr->doc_hit_info().document_id(), Eq(1));
  EXPECT_THAT(foo_itr->doc_hit_info().hit_section_ids_mask(),
              Eq(1U << kSectionIdBody));
  std::vector<TermMatchInfo> stats_doc1;
  foo_itr->PopulateMatchedTermsStats(&stats_doc1);
  EXPECT_THAT(stats_doc1,
              ElementsAre(EqualsTermMatchInfo("foo", expected_tf_doc1)));

  // Third hit: document 0, title and body sections.
  ASSERT_THAT(foo_itr->Advance(), IsOk());
  EXPECT_THAT(foo_itr->doc_hit_info().document_id(), Eq(0));
  EXPECT_THAT(foo_itr->doc_hit_info().hit_section_ids_mask(),
              Eq((1U << kSectionIdTitle) | (1U << kSectionIdBody)));
  std::vector<TermMatchInfo> stats_doc0;
  foo_itr->PopulateMatchedTermsStats(&stats_doc0);
  EXPECT_THAT(stats_doc0,
              ElementsAre(EqualsTermMatchInfo("foo", expected_tf_doc0)));
}
508
// Indexes three documents with sort-at-indexing disabled, persists the index,
// re-creates it over the same directory with sort-at-indexing enabled, and
// verifies that the re-created index reports no pending sort and returns the
// 'foo' hits in descending document id order with the expected per-section
// term frequencies.
TEST_F(TermIndexingHandlerTest, HandleIntoLiteIndex_enableSortInIndexing) {
  // Create the LiteIndex with a smaller sort threshold. At 64 bytes we sort the
  // HitBuffer after inserting 8 hits
  Index::Options unsorted_options(index_dir_,
                                  /*index_merge_size=*/1024 * 1024,
                                  /*lite_index_sort_at_indexing=*/false,
                                  /*lite_index_sort_size=*/64);
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<Index> index,
      Index::Create(unsorted_options, &filesystem_, &icing_filesystem_));

  DocumentProto fake_doc0 =
      DocumentBuilder()
          .SetKey("icing", "fake_type/0")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPropertyTitle), "foo foo foo")
          .AddStringProperty(std::string(kPropertyBody), "foo bar baz")
          .Build();
  DocumentProto fake_doc1 =
      DocumentBuilder()
          .SetKey("icing", "fake_type/1")
          .SetSchema(std::string(kFakeType))
          .AddStringProperty(std::string(kPropertyTitle), "bar baz baz")
          .AddStringProperty(std::string(kPropertyBody), "foo foo baz")
          .Build();
  // The nested document embeds a copy of fake_doc1, so it must be built before
  // fake_doc1 is moved from below.
  DocumentProto nested_doc =
      DocumentBuilder()
          .SetKey("icing", "nested_type/0")
          .SetSchema(std::string(kNestedType))
          .AddDocumentProperty(std::string(kPropertyNestedDoc), fake_doc1)
          .AddStringProperty(std::string(kPropertyName), "qux")
          .AddStringProperty(std::string(kPropertySubject), "bar bar")
          .Build();

  // Tokenize each document and put it into the document store.
  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc0,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                std::move(fake_doc0)));
  ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id0,
                             document_store_->Put(tokenized_doc0.document()));

  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc1,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                std::move(fake_doc1)));
  ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id1,
                             document_store_->Put(tokenized_doc1.document()));

  ICING_ASSERT_OK_AND_ASSIGN(
      TokenizedDocument tokenized_doc2,
      TokenizedDocument::Create(schema_store_.get(), lang_segmenter_.get(),
                                std::move(nested_doc)));
  ICING_ASSERT_OK_AND_ASSIGN(DocumentId doc_id2,
                             document_store_->Put(tokenized_doc2.document()));
  EXPECT_THAT(index->last_added_document_id(), Eq(kInvalidDocumentId));

  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<TermIndexingHandler> handler,
      TermIndexingHandler::Create(
          &fake_clock_, normalizer_.get(), index.get(),
          /*build_property_existence_metadata_hits=*/true));

  // Handle all docs
  EXPECT_THAT(handler->Handle(tokenized_doc0, doc_id0,
                              /*recovery_mode=*/false,
                              /*put_document_stats=*/nullptr),
              IsOk());
  EXPECT_THAT(handler->Handle(tokenized_doc1, doc_id1,
                              /*recovery_mode=*/false,
                              /*put_document_stats=*/nullptr),
              IsOk());
  EXPECT_THAT(handler->Handle(tokenized_doc2, doc_id2,
                              /*recovery_mode=*/false,
                              /*put_document_stats=*/nullptr),
              IsOk());
  EXPECT_THAT(index->last_added_document_id(), Eq(doc_id2));

  // We've disabled sorting during indexing so the HitBuffer's unsorted section
  // should exceed the sort threshold. PersistToDisk and reinitialize the
  // LiteIndex with sort_at_indexing=true.
  //
  // NOTE(review): `handler` was created with the raw pointer of the Index that
  // is replaced here. It is not used again below; confirm that its destructor
  // does not touch the old Index.
  ASSERT_THAT(index->PersistToDisk(), IsOk());
  Index::Options sorted_options(index_dir_,
                                /*index_merge_size=*/1024 * 1024,
                                /*lite_index_sort_at_indexing=*/true,
                                /*lite_index_sort_size=*/64);
  ICING_ASSERT_OK_AND_ASSIGN(
      index, Index::Create(sorted_options, &filesystem_, &icing_filesystem_));

  // Verify that the HitBuffer has been sorted after initializing with
  // sort_at_indexing enabled.
  EXPECT_THAT(index->LiteIndexNeedSort(), IsFalse());

  // Hits in the hit buffer:
  // <term>: {(docId, sectionId, term_freq)...}
  // foo: {(0, kSectionIdTitle, 3); (0, kSectionIdBody, 1);
  //       (1, kSectionIdBody, 2);
  //       (2, kSectionIdNestedBody, 2)}
  // bar: {(0, kSectionIdBody, 1);
  //       (1, kSectionIdTitle, 1);
  //       (2, kSectionIdNestedTitle, 1); (2, kSectionIdSubject, 2)}
  // baz: {(0, kSectionIdBody, 1);
  //       (1, kSectionIdTitle, 2); (1, kSectionIdBody, 1),
  //       (2, kSectionIdNestedTitle, 2); (2, kSectionIdNestedBody, 1)}
  // qux: {(2, kSectionIdName, 1)}

  // Expected per-section term frequencies of 'foo' for documents 2, 1 and 0.
  std::unordered_map<SectionId, Hit::TermFrequency> expected_tf_doc2 = {
      {kSectionIdNestedBody, 2}};
  std::unordered_map<SectionId, Hit::TermFrequency> expected_tf_doc1 = {
      {kSectionIdBody, 2}};
  std::unordered_map<SectionId, Hit::TermFrequency> expected_tf_doc0 = {
      {kSectionIdTitle, 3}, {kSectionIdBody, 1}};

  // Query 'foo'
  ICING_ASSERT_OK_AND_ASSIGN(
      std::unique_ptr<DocHitInfoIterator> foo_itr,
      index->GetIterator("foo", /*term_start_index=*/0,
                         /*unnormalized_term_length=*/0, kSectionIdMaskAll,
                         TermMatchType::EXACT_ONLY));

  // Advance the iterator and verify that we're returning hits in the correct
  // order (i.e. in descending order of DocId)

  // First hit: document 2, nested body section only.
  ASSERT_THAT(foo_itr->Advance(), IsOk());
  EXPECT_THAT(foo_itr->doc_hit_info().document_id(), Eq(2));
  EXPECT_THAT(foo_itr->doc_hit_info().hit_section_ids_mask(),
              Eq(1U << kSectionIdNestedBody));
  std::vector<TermMatchInfo> stats_doc2;
  foo_itr->PopulateMatchedTermsStats(&stats_doc2);
  EXPECT_THAT(stats_doc2,
              ElementsAre(EqualsTermMatchInfo("foo", expected_tf_doc2)));

  // Second hit: document 1, body section only.
  ASSERT_THAT(foo_itr->Advance(), IsOk());
  EXPECT_THAT(foo_itr->doc_hit_info().document_id(), Eq(1));
  EXPECT_THAT(foo_itr->doc_hit_info().hit_section_ids_mask(),
              Eq(1U << kSectionIdBody));
  std::vector<TermMatchInfo> stats_doc1;
  foo_itr->PopulateMatchedTermsStats(&stats_doc1);
  EXPECT_THAT(stats_doc1,
              ElementsAre(EqualsTermMatchInfo("foo", expected_tf_doc1)));

  // Third hit: document 0, title and body sections.
  ASSERT_THAT(foo_itr->Advance(), IsOk());
  EXPECT_THAT(foo_itr->doc_hit_info().document_id(), Eq(0));
  EXPECT_THAT(foo_itr->doc_hit_info().hit_section_ids_mask(),
              Eq((1U << kSectionIdTitle) | (1U << kSectionIdBody)));
  std::vector<TermMatchInfo> stats_doc0;
  foo_itr->PopulateMatchedTermsStats(&stats_doc0);
  EXPECT_THAT(stats_doc0,
              ElementsAre(EqualsTermMatchInfo("foo", expected_tf_doc0)));
}
660
661 } // namespace
662
663 } // namespace lib
664 } // namespace icing
665