• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/join/join-processor.h"
16 
17 #include <algorithm>
18 #include <memory>
19 #include <optional>
20 #include <string>
21 #include <string_view>
22 #include <unordered_map>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/text_classifier/lib3/utils/base/statusor.h"
27 #include "icing/absl_ports/canonical_errors.h"
28 #include "icing/absl_ports/str_cat.h"
29 #include "icing/join/aggregation-scorer.h"
30 #include "icing/join/doc-join-info.h"
31 #include "icing/join/join-children-fetcher.h"
32 #include "icing/join/qualified-id-join-index.h"
33 #include "icing/join/qualified-id.h"
34 #include "icing/proto/schema.pb.h"
35 #include "icing/proto/scoring.pb.h"
36 #include "icing/proto/search.pb.h"
37 #include "icing/schema/joinable-property.h"
38 #include "icing/scoring/scored-document-hit.h"
39 #include "icing/store/document-filter-data.h"
40 #include "icing/store/document-id.h"
41 #include "icing/store/namespace-fingerprint-identifier.h"
42 #include "icing/util/status-macros.h"
43 
44 namespace icing {
45 namespace lib {
46 
47 libtextclassifier3::StatusOr<JoinChildrenFetcher>
GetChildrenFetcher(const JoinSpecProto & join_spec,std::vector<ScoredDocumentHit> && child_scored_document_hits)48 JoinProcessor::GetChildrenFetcher(
49     const JoinSpecProto& join_spec,
50     std::vector<ScoredDocumentHit>&& child_scored_document_hits) {
51   if (join_spec.parent_property_expression() != kQualifiedIdExpr) {
52     // TODO(b/256022027): So far we only support kQualifiedIdExpr for
53     // parent_property_expression, we could support more.
54     return absl_ports::UnimplementedError(absl_ports::StrCat(
55         "Parent property expression must be ", kQualifiedIdExpr));
56   }
57 
58   ScoredDocumentHitComparator score_comparator(
59       /*is_descending=*/join_spec.nested_spec().scoring_spec().order_by() ==
60       ScoringSpecProto::Order::DESC);
61 
62   if (qualified_id_join_index_->is_v2()) {
63     // v2
64     // Step 1a: sort child ScoredDocumentHits in document id descending order.
65     std::sort(child_scored_document_hits.begin(),
66               child_scored_document_hits.end(),
67               [](const ScoredDocumentHit& lhs, const ScoredDocumentHit& rhs) {
68                 return lhs.document_id() > rhs.document_id();
69               });
70 
71     // Step 1b: group all child ScoredDocumentHits by the document's
72     //          schema_type_id.
73     std::unordered_map<SchemaTypeId, std::vector<ScoredDocumentHit>>
74         schema_to_child_scored_doc_hits_map;
75     for (const ScoredDocumentHit& child_scored_document_hit :
76          child_scored_document_hits) {
77       std::optional<DocumentFilterData> child_doc_filter_data =
78           doc_store_->GetAliveDocumentFilterData(
79               child_scored_document_hit.document_id(), current_time_ms_);
80       if (!child_doc_filter_data) {
81         continue;
82       }
83 
84       schema_to_child_scored_doc_hits_map[child_doc_filter_data
85                                               ->schema_type_id()]
86           .push_back(child_scored_document_hit);
87     }
88 
89     // Step 1c: for each schema_type_id, lookup QualifiedIdJoinIndexImplV2 to
90     //          fetch all child join data from posting list(s). Convert all
91     //          child join data to referenced parent document ids and bucketize
92     //          child ScoredDocumentHits by it.
93     std::unordered_map<DocumentId, std::vector<ScoredDocumentHit>>
94         parent_to_child_docs_map;
95     for (auto& [schema_type_id, grouped_child_scored_doc_hits] :
96          schema_to_child_scored_doc_hits_map) {
97       // Get joinable_property_id of this schema.
98       ICING_ASSIGN_OR_RETURN(
99           const JoinablePropertyMetadata* metadata,
100           schema_store_->GetJoinablePropertyMetadata(
101               schema_type_id, join_spec.child_property_expression()));
102       if (metadata == nullptr ||
103           metadata->value_type != JoinableConfig::ValueType::QUALIFIED_ID) {
104         // Currently we only support qualified id, so skip other types.
105         continue;
106       }
107 
108       // Lookup QualifiedIdJoinIndexImplV2.
109       ICING_ASSIGN_OR_RETURN(
110           std::unique_ptr<QualifiedIdJoinIndex::JoinDataIteratorBase>
111               join_index_iter,
112           qualified_id_join_index_->GetIterator(
113               schema_type_id, /*joinable_property_id=*/metadata->id));
114 
115       // - Join index contains all join data of schema_type_id and
116       //   join_index_iter will return all of them in (child) document id
117       //   descending order.
118       // - But we only need join data of child document ids which appear in
119       //   grouped_child_scored_doc_hits. Also grouped_child_scored_doc_hits
120       //   contain ScoredDocumentHits in (child) document id descending order.
121       // - Therefore, we advance 2 iterators to intersect them and get desired
122       //   join data.
123       auto child_scored_doc_hits_iter = grouped_child_scored_doc_hits.cbegin();
124       while (join_index_iter->Advance().ok() &&
125              child_scored_doc_hits_iter !=
126                  grouped_child_scored_doc_hits.cend()) {
127         // Advance child_scored_doc_hits_iter until it points to a
128         // ScoredDocumentHit with document id <= the one pointed by
129         // join_index_iter.
130         while (child_scored_doc_hits_iter !=
131                    grouped_child_scored_doc_hits.cend() &&
132                child_scored_doc_hits_iter->document_id() >
133                    join_index_iter->GetCurrent().document_id()) {
134           ++child_scored_doc_hits_iter;
135         }
136 
137         if (child_scored_doc_hits_iter !=
138                 grouped_child_scored_doc_hits.cend() &&
139             child_scored_doc_hits_iter->document_id() ==
140                 join_index_iter->GetCurrent().document_id()) {
141           // We get a join data whose child document id exists in both join
142           // index and grouped_child_scored_doc_hits. Convert its join info to
143           // referenced parent document ids and bucketize ScoredDocumentHits by
144           // it (putting into parent_to_child_docs_map).
145           const NamespaceFingerprintIdentifier& ref_ns_id =
146               join_index_iter->GetCurrent().join_info();
147           libtextclassifier3::StatusOr<DocumentId> ref_parent_doc_id_or =
148               doc_store_->GetDocumentId(ref_ns_id);
149           if (ref_parent_doc_id_or.ok()) {
150             parent_to_child_docs_map[std::move(ref_parent_doc_id_or)
151                                          .ValueOrDie()]
152                 .push_back(*child_scored_doc_hits_iter);
153           }
154         }
155       }
156     }
157 
158     // Step 1d: finally, sort each parent's joined child ScoredDocumentHits by
159     //          score.
160     for (auto& [parent_doc_id, bucketized_child_scored_hits] :
161          parent_to_child_docs_map) {
162       std::sort(bucketized_child_scored_hits.begin(),
163                 bucketized_child_scored_hits.end(), score_comparator);
164     }
165 
166     return JoinChildrenFetcher(join_spec, std::move(parent_to_child_docs_map));
167   }
168 
169   // v1
170   // TODO(b/275121148): deprecate this part after rollout v2.
171   std::sort(child_scored_document_hits.begin(),
172             child_scored_document_hits.end(), score_comparator);
173 
174   // Step 1: group child documents by parent documentId. Currently we only
175   //         support QualifiedId joining, so fetch the qualified id content of
176   //         child_property_expression, break it down into namespace + uri, and
177   //         lookup the DocumentId.
178   // The keys of this map are the DocumentIds of the parent docs the child
179   // ScoredDocumentHits refer to. The values in this map are vectors of child
180   // ScoredDocumentHits that refer to a parent DocumentId.
181   std::unordered_map<DocumentId, std::vector<ScoredDocumentHit>>
182       map_joinable_qualified_id;
183   for (const ScoredDocumentHit& child : child_scored_document_hits) {
184     ICING_ASSIGN_OR_RETURN(
185         DocumentId ref_doc_id,
186         FetchReferencedQualifiedId(child.document_id(),
187                                    join_spec.child_property_expression()));
188     if (ref_doc_id == kInvalidDocumentId) {
189       continue;
190     }
191 
192     map_joinable_qualified_id[ref_doc_id].push_back(child);
193   }
194   return JoinChildrenFetcher(join_spec, std::move(map_joinable_qualified_id));
195 }
196 
197 libtextclassifier3::StatusOr<std::vector<JoinedScoredDocumentHit>>
Join(const JoinSpecProto & join_spec,std::vector<ScoredDocumentHit> && parent_scored_document_hits,const JoinChildrenFetcher & join_children_fetcher)198 JoinProcessor::Join(
199     const JoinSpecProto& join_spec,
200     std::vector<ScoredDocumentHit>&& parent_scored_document_hits,
201     const JoinChildrenFetcher& join_children_fetcher) {
202   std::unique_ptr<AggregationScorer> aggregation_scorer =
203       AggregationScorer::Create(join_spec);
204 
205   std::vector<JoinedScoredDocumentHit> joined_scored_document_hits;
206   joined_scored_document_hits.reserve(parent_scored_document_hits.size());
207 
208   // Step 2: iterate through all parent documentIds and construct
209   //         JoinedScoredDocumentHit for each by looking up
210   //         join_children_fetcher.
211   for (ScoredDocumentHit& parent : parent_scored_document_hits) {
212     ICING_ASSIGN_OR_RETURN(
213         std::vector<ScoredDocumentHit> children,
214         join_children_fetcher.GetChildren(parent.document_id()));
215 
216     double final_score = aggregation_scorer->GetScore(parent, children);
217     joined_scored_document_hits.emplace_back(final_score, std::move(parent),
218                                              std::move(children));
219   }
220 
221   return joined_scored_document_hits;
222 }
223 
224 libtextclassifier3::StatusOr<DocumentId>
FetchReferencedQualifiedId(const DocumentId & document_id,const std::string & property_path) const225 JoinProcessor::FetchReferencedQualifiedId(
226     const DocumentId& document_id, const std::string& property_path) const {
227   std::optional<DocumentFilterData> filter_data =
228       doc_store_->GetAliveDocumentFilterData(document_id, current_time_ms_);
229   if (!filter_data) {
230     return kInvalidDocumentId;
231   }
232 
233   ICING_ASSIGN_OR_RETURN(const JoinablePropertyMetadata* metadata,
234                          schema_store_->GetJoinablePropertyMetadata(
235                              filter_data->schema_type_id(), property_path));
236   if (metadata == nullptr ||
237       metadata->value_type != JoinableConfig::ValueType::QUALIFIED_ID) {
238     // Currently we only support qualified id.
239     return kInvalidDocumentId;
240   }
241 
242   DocJoinInfo info(document_id, metadata->id);
243   libtextclassifier3::StatusOr<std::string_view> ref_qualified_id_str_or =
244       qualified_id_join_index_->Get(info);
245   if (!ref_qualified_id_str_or.ok()) {
246     if (absl_ports::IsNotFound(ref_qualified_id_str_or.status())) {
247       return kInvalidDocumentId;
248     }
249     return std::move(ref_qualified_id_str_or).status();
250   }
251 
252   libtextclassifier3::StatusOr<QualifiedId> ref_qualified_id_or =
253       QualifiedId::Parse(std::move(ref_qualified_id_str_or).ValueOrDie());
254   if (!ref_qualified_id_or.ok()) {
255     // This shouldn't happen because we've validated it during indexing and only
256     // put valid qualified id strings into qualified id join index.
257     return kInvalidDocumentId;
258   }
259   QualifiedId qualified_id = std::move(ref_qualified_id_or).ValueOrDie();
260 
261   libtextclassifier3::StatusOr<DocumentId> ref_document_id_or =
262       doc_store_->GetDocumentId(qualified_id.name_space(), qualified_id.uri());
263   if (!ref_document_id_or.ok()) {
264     return kInvalidDocumentId;
265   }
266   return std::move(ref_document_id_or).ValueOrDie();
267 }
268 
269 }  // namespace lib
270 }  // namespace icing
271