1 // Copyright (C) 2023 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/join/qualified-id-join-indexing-handler.h"
16
17 #include <cstdint>
18 #include <limits>
19 #include <memory>
20 #include <optional>
21 #include <string_view>
22 #include <utility>
23 #include <vector>
24
25 #include "icing/text_classifier/lib3/utils/base/status.h"
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/absl_ports/canonical_errors.h"
28 #include "icing/join/document-join-id-pair.h"
29 #include "icing/join/qualified-id-join-index.h"
30 #include "icing/join/qualified-id.h"
31 #include "icing/legacy/core/icing-string-util.h"
32 #include "icing/proto/logging.pb.h"
33 #include "icing/schema/joinable-property.h"
34 #include "icing/store/document-filter-data.h"
35 #include "icing/store/document-id.h"
36 #include "icing/store/document-store.h"
37 #include "icing/store/namespace-id-fingerprint.h"
38 #include "icing/store/namespace-id.h"
39 #include "icing/util/clock.h"
40 #include "icing/util/logging.h"
41 #include "icing/util/status-macros.h"
42 #include "icing/util/tokenized-document.h"
43
44 namespace icing {
45 namespace lib {
46
47 /* static */ libtextclassifier3::StatusOr<
48 std::unique_ptr<QualifiedIdJoinIndexingHandler>>
Create(const Clock * clock,const DocumentStore * doc_store,QualifiedIdJoinIndex * qualified_id_join_index)49 QualifiedIdJoinIndexingHandler::Create(
50 const Clock* clock, const DocumentStore* doc_store,
51 QualifiedIdJoinIndex* qualified_id_join_index) {
52 ICING_RETURN_ERROR_IF_NULL(clock);
53 ICING_RETURN_ERROR_IF_NULL(doc_store);
54 ICING_RETURN_ERROR_IF_NULL(qualified_id_join_index);
55
56 return std::unique_ptr<QualifiedIdJoinIndexingHandler>(
57 new QualifiedIdJoinIndexingHandler(clock, doc_store,
58 qualified_id_join_index));
59 }
60
Handle(const TokenizedDocument & tokenized_document,DocumentId document_id,DocumentId old_document_id,bool recovery_mode,PutDocumentStatsProto * put_document_stats)61 libtextclassifier3::Status QualifiedIdJoinIndexingHandler::Handle(
62 const TokenizedDocument& tokenized_document, DocumentId document_id,
63 DocumentId old_document_id, bool recovery_mode,
64 PutDocumentStatsProto* put_document_stats) {
65 std::unique_ptr<Timer> index_timer = clock_.GetNewTimer();
66
67 if (!IsDocumentIdValid(document_id)) {
68 return absl_ports::InvalidArgumentError(
69 IcingStringUtil::StringPrintf("Invalid DocumentId %d", document_id));
70 }
71
72 if (qualified_id_join_index_.last_added_document_id() != kInvalidDocumentId &&
73 document_id <= qualified_id_join_index_.last_added_document_id()) {
74 if (recovery_mode) {
75 // Skip the document if document_id <= last_added_document_id in recovery
76 // mode without returning an error.
77 return libtextclassifier3::Status::OK;
78 }
79 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
80 "DocumentId %d must be greater than last added document_id %d",
81 document_id, qualified_id_join_index_.last_added_document_id()));
82 }
83 qualified_id_join_index_.set_last_added_document_id(document_id);
84
85 switch (qualified_id_join_index_.version()) {
86 case QualifiedIdJoinIndex::Version::kV2:
87 ICING_RETURN_IF_ERROR(HandleV2(tokenized_document, document_id));
88 break;
89 case QualifiedIdJoinIndex::Version::kV3:
90 ICING_RETURN_IF_ERROR(
91 HandleV3(tokenized_document, document_id, old_document_id));
92 break;
93 }
94
95 if (put_document_stats != nullptr) {
96 put_document_stats->set_qualified_id_join_index_latency_ms(
97 index_timer->GetElapsedMilliseconds());
98 }
99
100 return libtextclassifier3::Status::OK;
101 }
102
HandleV2(const TokenizedDocument & tokenized_document,DocumentId document_id)103 libtextclassifier3::Status QualifiedIdJoinIndexingHandler::HandleV2(
104 const TokenizedDocument& tokenized_document, DocumentId document_id) {
105 std::optional<DocumentFilterData> filter_data =
106 doc_store_.GetAliveDocumentFilterData(
107 document_id,
108 /*current_time_ms=*/std::numeric_limits<int64_t>::min());
109 if (!filter_data) {
110 // This should not happen.
111 return absl_ports::InternalError(
112 "Failed to get alive document filter data when indexing");
113 }
114
115 for (const JoinableProperty<std::string_view>& qualified_id_property :
116 tokenized_document.qualified_id_join_properties()) {
117 // Parse all qualified id strings and convert them to
118 // NamespaceIdFingerprint.
119 std::vector<NamespaceIdFingerprint> ref_doc_nsid_uri_fingerprints;
120 for (std::string_view ref_qualified_id_str : qualified_id_property.values) {
121 // Attempt to parse qualified id string to make sure the format is
122 // correct.
123 auto ref_qualified_id_or = QualifiedId::Parse(ref_qualified_id_str);
124 if (!ref_qualified_id_or.ok()) {
125 // Skip incorrect format of qualified id string.
126 continue;
127 }
128
129 QualifiedId ref_qualified_id =
130 std::move(ref_qualified_id_or).ValueOrDie();
131 auto ref_namespace_id_or =
132 doc_store_.GetNamespaceId(ref_qualified_id.name_space());
133 if (!ref_namespace_id_or.ok()) {
134 // Skip invalid namespace id.
135 continue;
136 }
137 NamespaceId ref_namespace_id =
138 std::move(ref_namespace_id_or).ValueOrDie();
139
140 ref_doc_nsid_uri_fingerprints.push_back(
141 NamespaceIdFingerprint(ref_namespace_id, ref_qualified_id.uri()));
142 }
143
144 // Batch add all join data of this (schema_type_id, joinable_property_id)
145 // into to the index.
146 libtextclassifier3::Status status = qualified_id_join_index_.Put(
147 filter_data->schema_type_id(), qualified_id_property.metadata.id,
148 document_id, std::move(ref_doc_nsid_uri_fingerprints));
149 if (!status.ok()) {
150 ICING_LOG(WARNING)
151 << "Failed to add data into qualified id join index v2 due to: "
152 << status.error_message();
153 return status;
154 }
155 }
156 return libtextclassifier3::Status::OK;
157 }
158
HandleV3(const TokenizedDocument & tokenized_document,DocumentId document_id,DocumentId old_document_id)159 libtextclassifier3::Status QualifiedIdJoinIndexingHandler::HandleV3(
160 const TokenizedDocument& tokenized_document, DocumentId document_id,
161 DocumentId old_document_id) {
162 // (Parent perspective)
163 // When replacement, if there were any existing child documents joining to it,
164 // then we need to migrate the old document id to the new document id.
165 if (IsDocumentIdValid(old_document_id)) {
166 ICING_RETURN_IF_ERROR(
167 qualified_id_join_index_.MigrateParent(old_document_id, document_id));
168 }
169
170 // (Child perspective)
171 // Add child join data.
172 for (const JoinableProperty<std::string_view>& qualified_id_property :
173 tokenized_document.qualified_id_join_properties()) {
174 if (qualified_id_property.values.empty()) {
175 continue;
176 }
177
178 DocumentJoinIdPair child_doc_join_id_pair(
179 document_id, qualified_id_property.metadata.id);
180
181 // Extract parent qualified ids and lookup their corresponding document ids.
182 std::vector<DocumentId> parent_doc_ids;
183 parent_doc_ids.reserve(qualified_id_property.values.size());
184 for (std::string_view parent_qualified_id_str :
185 qualified_id_property.values) {
186 libtextclassifier3::StatusOr<QualifiedId> parent_qualified_id_or =
187 QualifiedId::Parse(parent_qualified_id_str);
188 if (!parent_qualified_id_or.ok()) {
189 // Skip incorrect format of qualified id string.
190 continue;
191 }
192 QualifiedId parent_qualified_id =
193 std::move(parent_qualified_id_or).ValueOrDie();
194
195 // Lookup document store to get the parent document id.
196 libtextclassifier3::StatusOr<DocumentId> parent_doc_id_or =
197 doc_store_.GetDocumentId(parent_qualified_id.name_space(),
198 parent_qualified_id.uri());
199 if (!parent_doc_id_or.ok() ||
200 parent_doc_id_or.ValueOrDie() == kInvalidDocumentId) {
201 // Skip invalid parent document id or parent document does not exist.
202 continue;
203 }
204 parent_doc_ids.push_back(parent_doc_id_or.ValueOrDie());
205 }
206
207 // Add all parent document ids to the index.
208 libtextclassifier3::Status status = qualified_id_join_index_.Put(
209 child_doc_join_id_pair, std::move(parent_doc_ids));
210 if (!status.ok()) {
211 ICING_LOG(WARNING)
212 << "Failed to add data into qualified id join index due to: "
213 << status.error_message();
214 return status;
215 }
216 }
217 return libtextclassifier3::Status::OK;
218 }
219
220 } // namespace lib
221 } // namespace icing
222