1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/join/join-processor.h"
16
17 #include <algorithm>
18 #include <memory>
19 #include <optional>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 #include <utility>
24 #include <vector>
25
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/absl_ports/canonical_errors.h"
28 #include "icing/absl_ports/str_cat.h"
29 #include "icing/join/aggregation-scorer.h"
30 #include "icing/join/doc-join-info.h"
31 #include "icing/join/join-children-fetcher.h"
32 #include "icing/join/qualified-id-join-index.h"
33 #include "icing/join/qualified-id.h"
34 #include "icing/proto/schema.pb.h"
35 #include "icing/proto/scoring.pb.h"
36 #include "icing/proto/search.pb.h"
37 #include "icing/schema/joinable-property.h"
38 #include "icing/scoring/scored-document-hit.h"
39 #include "icing/store/document-filter-data.h"
40 #include "icing/store/document-id.h"
41 #include "icing/store/namespace-fingerprint-identifier.h"
42 #include "icing/util/status-macros.h"
43
44 namespace icing {
45 namespace lib {
46
47 libtextclassifier3::StatusOr<JoinChildrenFetcher>
GetChildrenFetcher(const JoinSpecProto & join_spec,std::vector<ScoredDocumentHit> && child_scored_document_hits)48 JoinProcessor::GetChildrenFetcher(
49 const JoinSpecProto& join_spec,
50 std::vector<ScoredDocumentHit>&& child_scored_document_hits) {
51 if (join_spec.parent_property_expression() != kQualifiedIdExpr) {
52 // TODO(b/256022027): So far we only support kQualifiedIdExpr for
53 // parent_property_expression, we could support more.
54 return absl_ports::UnimplementedError(absl_ports::StrCat(
55 "Parent property expression must be ", kQualifiedIdExpr));
56 }
57
58 ScoredDocumentHitComparator score_comparator(
59 /*is_descending=*/join_spec.nested_spec().scoring_spec().order_by() ==
60 ScoringSpecProto::Order::DESC);
61
62 if (qualified_id_join_index_->is_v2()) {
63 // v2
64 // Step 1a: sort child ScoredDocumentHits in document id descending order.
65 std::sort(child_scored_document_hits.begin(),
66 child_scored_document_hits.end(),
67 [](const ScoredDocumentHit& lhs, const ScoredDocumentHit& rhs) {
68 return lhs.document_id() > rhs.document_id();
69 });
70
71 // Step 1b: group all child ScoredDocumentHits by the document's
72 // schema_type_id.
73 std::unordered_map<SchemaTypeId, std::vector<ScoredDocumentHit>>
74 schema_to_child_scored_doc_hits_map;
75 for (const ScoredDocumentHit& child_scored_document_hit :
76 child_scored_document_hits) {
77 std::optional<DocumentFilterData> child_doc_filter_data =
78 doc_store_->GetAliveDocumentFilterData(
79 child_scored_document_hit.document_id(), current_time_ms_);
80 if (!child_doc_filter_data) {
81 continue;
82 }
83
84 schema_to_child_scored_doc_hits_map[child_doc_filter_data
85 ->schema_type_id()]
86 .push_back(child_scored_document_hit);
87 }
88
89 // Step 1c: for each schema_type_id, lookup QualifiedIdJoinIndexImplV2 to
90 // fetch all child join data from posting list(s). Convert all
91 // child join data to referenced parent document ids and bucketize
92 // child ScoredDocumentHits by it.
93 std::unordered_map<DocumentId, std::vector<ScoredDocumentHit>>
94 parent_to_child_docs_map;
95 for (auto& [schema_type_id, grouped_child_scored_doc_hits] :
96 schema_to_child_scored_doc_hits_map) {
97 // Get joinable_property_id of this schema.
98 ICING_ASSIGN_OR_RETURN(
99 const JoinablePropertyMetadata* metadata,
100 schema_store_->GetJoinablePropertyMetadata(
101 schema_type_id, join_spec.child_property_expression()));
102 if (metadata == nullptr ||
103 metadata->value_type != JoinableConfig::ValueType::QUALIFIED_ID) {
104 // Currently we only support qualified id, so skip other types.
105 continue;
106 }
107
108 // Lookup QualifiedIdJoinIndexImplV2.
109 ICING_ASSIGN_OR_RETURN(
110 std::unique_ptr<QualifiedIdJoinIndex::JoinDataIteratorBase>
111 join_index_iter,
112 qualified_id_join_index_->GetIterator(
113 schema_type_id, /*joinable_property_id=*/metadata->id));
114
115 // - Join index contains all join data of schema_type_id and
116 // join_index_iter will return all of them in (child) document id
117 // descending order.
118 // - But we only need join data of child document ids which appear in
119 // grouped_child_scored_doc_hits. Also grouped_child_scored_doc_hits
120 // contain ScoredDocumentHits in (child) document id descending order.
121 // - Therefore, we advance 2 iterators to intersect them and get desired
122 // join data.
123 auto child_scored_doc_hits_iter = grouped_child_scored_doc_hits.cbegin();
124 while (join_index_iter->Advance().ok() &&
125 child_scored_doc_hits_iter !=
126 grouped_child_scored_doc_hits.cend()) {
127 // Advance child_scored_doc_hits_iter until it points to a
128 // ScoredDocumentHit with document id <= the one pointed by
129 // join_index_iter.
130 while (child_scored_doc_hits_iter !=
131 grouped_child_scored_doc_hits.cend() &&
132 child_scored_doc_hits_iter->document_id() >
133 join_index_iter->GetCurrent().document_id()) {
134 ++child_scored_doc_hits_iter;
135 }
136
137 if (child_scored_doc_hits_iter !=
138 grouped_child_scored_doc_hits.cend() &&
139 child_scored_doc_hits_iter->document_id() ==
140 join_index_iter->GetCurrent().document_id()) {
141 // We get a join data whose child document id exists in both join
142 // index and grouped_child_scored_doc_hits. Convert its join info to
143 // referenced parent document ids and bucketize ScoredDocumentHits by
144 // it (putting into parent_to_child_docs_map).
145 const NamespaceFingerprintIdentifier& ref_ns_id =
146 join_index_iter->GetCurrent().join_info();
147 libtextclassifier3::StatusOr<DocumentId> ref_parent_doc_id_or =
148 doc_store_->GetDocumentId(ref_ns_id);
149 if (ref_parent_doc_id_or.ok()) {
150 parent_to_child_docs_map[std::move(ref_parent_doc_id_or)
151 .ValueOrDie()]
152 .push_back(*child_scored_doc_hits_iter);
153 }
154 }
155 }
156 }
157
158 // Step 1d: finally, sort each parent's joined child ScoredDocumentHits by
159 // score.
160 for (auto& [parent_doc_id, bucketized_child_scored_hits] :
161 parent_to_child_docs_map) {
162 std::sort(bucketized_child_scored_hits.begin(),
163 bucketized_child_scored_hits.end(), score_comparator);
164 }
165
166 return JoinChildrenFetcher(join_spec, std::move(parent_to_child_docs_map));
167 }
168
169 // v1
170 // TODO(b/275121148): deprecate this part after rollout v2.
171 std::sort(child_scored_document_hits.begin(),
172 child_scored_document_hits.end(), score_comparator);
173
174 // Step 1: group child documents by parent documentId. Currently we only
175 // support QualifiedId joining, so fetch the qualified id content of
176 // child_property_expression, break it down into namespace + uri, and
177 // lookup the DocumentId.
178 // The keys of this map are the DocumentIds of the parent docs the child
179 // ScoredDocumentHits refer to. The values in this map are vectors of child
180 // ScoredDocumentHits that refer to a parent DocumentId.
181 std::unordered_map<DocumentId, std::vector<ScoredDocumentHit>>
182 map_joinable_qualified_id;
183 for (const ScoredDocumentHit& child : child_scored_document_hits) {
184 ICING_ASSIGN_OR_RETURN(
185 DocumentId ref_doc_id,
186 FetchReferencedQualifiedId(child.document_id(),
187 join_spec.child_property_expression()));
188 if (ref_doc_id == kInvalidDocumentId) {
189 continue;
190 }
191
192 map_joinable_qualified_id[ref_doc_id].push_back(child);
193 }
194 return JoinChildrenFetcher(join_spec, std::move(map_joinable_qualified_id));
195 }
196
197 libtextclassifier3::StatusOr<std::vector<JoinedScoredDocumentHit>>
Join(const JoinSpecProto & join_spec,std::vector<ScoredDocumentHit> && parent_scored_document_hits,const JoinChildrenFetcher & join_children_fetcher)198 JoinProcessor::Join(
199 const JoinSpecProto& join_spec,
200 std::vector<ScoredDocumentHit>&& parent_scored_document_hits,
201 const JoinChildrenFetcher& join_children_fetcher) {
202 std::unique_ptr<AggregationScorer> aggregation_scorer =
203 AggregationScorer::Create(join_spec);
204
205 std::vector<JoinedScoredDocumentHit> joined_scored_document_hits;
206 joined_scored_document_hits.reserve(parent_scored_document_hits.size());
207
208 // Step 2: iterate through all parent documentIds and construct
209 // JoinedScoredDocumentHit for each by looking up
210 // join_children_fetcher.
211 for (ScoredDocumentHit& parent : parent_scored_document_hits) {
212 ICING_ASSIGN_OR_RETURN(
213 std::vector<ScoredDocumentHit> children,
214 join_children_fetcher.GetChildren(parent.document_id()));
215
216 double final_score = aggregation_scorer->GetScore(parent, children);
217 joined_scored_document_hits.emplace_back(final_score, std::move(parent),
218 std::move(children));
219 }
220
221 return joined_scored_document_hits;
222 }
223
224 libtextclassifier3::StatusOr<DocumentId>
FetchReferencedQualifiedId(const DocumentId & document_id,const std::string & property_path) const225 JoinProcessor::FetchReferencedQualifiedId(
226 const DocumentId& document_id, const std::string& property_path) const {
227 std::optional<DocumentFilterData> filter_data =
228 doc_store_->GetAliveDocumentFilterData(document_id, current_time_ms_);
229 if (!filter_data) {
230 return kInvalidDocumentId;
231 }
232
233 ICING_ASSIGN_OR_RETURN(const JoinablePropertyMetadata* metadata,
234 schema_store_->GetJoinablePropertyMetadata(
235 filter_data->schema_type_id(), property_path));
236 if (metadata == nullptr ||
237 metadata->value_type != JoinableConfig::ValueType::QUALIFIED_ID) {
238 // Currently we only support qualified id.
239 return kInvalidDocumentId;
240 }
241
242 DocJoinInfo info(document_id, metadata->id);
243 libtextclassifier3::StatusOr<std::string_view> ref_qualified_id_str_or =
244 qualified_id_join_index_->Get(info);
245 if (!ref_qualified_id_str_or.ok()) {
246 if (absl_ports::IsNotFound(ref_qualified_id_str_or.status())) {
247 return kInvalidDocumentId;
248 }
249 return std::move(ref_qualified_id_str_or).status();
250 }
251
252 libtextclassifier3::StatusOr<QualifiedId> ref_qualified_id_or =
253 QualifiedId::Parse(std::move(ref_qualified_id_str_or).ValueOrDie());
254 if (!ref_qualified_id_or.ok()) {
255 // This shouldn't happen because we've validated it during indexing and only
256 // put valid qualified id strings into qualified id join index.
257 return kInvalidDocumentId;
258 }
259 QualifiedId qualified_id = std::move(ref_qualified_id_or).ValueOrDie();
260
261 libtextclassifier3::StatusOr<DocumentId> ref_document_id_or =
262 doc_store_->GetDocumentId(qualified_id.name_space(), qualified_id.uri());
263 if (!ref_document_id_or.ok()) {
264 return kInvalidDocumentId;
265 }
266 return std::move(ref_document_id_or).ValueOrDie();
267 }
268
269 } // namespace lib
270 } // namespace icing
271