1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/monkey_test/in-memory-icing-search-engine.h"
16 
17 #include <algorithm>
18 #include <cstdint>
19 #include <memory>
20 #include <random>
21 #include <string>
22 #include <string_view>
23 #include <unordered_set>
24 #include <utility>
25 #include <vector>
26 
27 #include "icing/text_classifier/lib3/utils/base/status.h"
28 #include "icing/text_classifier/lib3/utils/base/statusor.h"
29 #include "icing/absl_ports/canonical_errors.h"
30 #include "icing/absl_ports/str_cat.h"
31 #include "icing/absl_ports/str_join.h"
32 #include "icing/index/embed/embedding-scorer.h"
33 #include "icing/monkey_test/monkey-tokenized-document.h"
34 #include "icing/proto/document.pb.h"
35 #include "icing/proto/schema.pb.h"
36 #include "icing/proto/search.pb.h"
37 #include "icing/proto/term.pb.h"
38 #include "icing/store/document-id.h"
39 #include "icing/util/status-macros.h"
40 
41 namespace icing {
42 namespace lib {
43 
44 namespace {
45 
46 // Check if s1 is a prefix of s2.
IsPrefix(std::string_view s1,std::string_view s2)47 bool IsPrefix(std::string_view s1, std::string_view s2) {
48   if (s1.length() > s2.length()) {
49     return false;
50   }
51   return s1 == s2.substr(0, s1.length());
52 }
53 
54 const std::string_view kSemanticSearchPrefix =
55     "semanticSearch(getEmbeddingParameter(0)";
56 
GetEmbeddingSearchRange(std::string_view s)57 libtextclassifier3::StatusOr<std::pair<double, double>> GetEmbeddingSearchRange(
58     std::string_view s) {
59   std::vector<double> values;
60   std::string current_number;
61   int i = s.find(kSemanticSearchPrefix) + kSemanticSearchPrefix.length();
62   for (; i < s.size(); ++i) {
63     char c = s[i];
64     if (c == '.' || c == '-' || (c >= '0' && c <= '9')) {
65       current_number += c;
66     } else {
67       if (!current_number.empty()) {
68         values.push_back(std::stod(current_number));
69         current_number.clear();
70       }
71     }
72   }
73   if (values.size() != 2) {
74     return absl_ports::InvalidArgumentError(
75         absl_ports::StrCat("Not an embedding search.", s));
76   }
77   return std::make_pair(values[0], values[1]);
78 }
79 
DoesVectorsMatch(const EmbeddingScorer * scorer,std::pair<double,double> embedding_search_range,const PropertyProto::VectorProto & vector1,const PropertyProto::VectorProto & vector2)80 bool DoesVectorsMatch(const EmbeddingScorer *scorer,
81                       std::pair<double, double> embedding_search_range,
82                       const PropertyProto::VectorProto &vector1,
83                       const PropertyProto::VectorProto &vector2) {
84   if (vector1.model_signature() != vector2.model_signature() ||
85       vector1.values_size() != vector2.values_size()) {
86     return false;
87   }
88   float score = scorer->Score(vector1.values_size(), vector1.values().data(),
89                               vector2.values().data());
90   return embedding_search_range.first <= score &&
91          score <= embedding_search_range.second;
92 }
93 
94 }  // namespace
95 
96 libtextclassifier3::StatusOr<const PropertyConfigProto *>
GetPropertyConfig(const std::string & schema_type,const std::string & property_name) const97 InMemoryIcingSearchEngine::GetPropertyConfig(
98     const std::string &schema_type, const std::string &property_name) const {
99   auto schema_iter = property_config_map_.find(schema_type);
100   if (schema_iter == property_config_map_.end()) {
101     return absl_ports::NotFoundError(
102         absl_ports::StrCat("Schema type: ", schema_type, " is not found."));
103   }
104   auto property_iter = schema_iter->second.find(property_name);
105   if (property_iter == schema_iter->second.end()) {
106     return absl_ports::NotFoundError(
107         absl_ports::StrCat("Property: ", property_name, " is not found."));
108   }
109   return &property_iter->second;
110 }
111 
112 libtextclassifier3::StatusOr<InMemoryIcingSearchEngine::PropertyIndexInfo>
GetPropertyIndexInfo(const std::string & schema_type,const MonkeyTokenizedSection & section) const113 InMemoryIcingSearchEngine::GetPropertyIndexInfo(
114     const std::string &schema_type,
115     const MonkeyTokenizedSection §ion) const {
116   bool in_indexable_properties_list = false;
117   bool all_indexable_from_top = true;
118 
119   std::vector<std::string_view> properties_in_path =
120       absl_ports::StrSplit(section.path, ".");
121   if (properties_in_path.empty()) {
122     return absl_ports::InvalidArgumentError("Got empty path.");
123   }
124   std::string curr_schema_type = schema_type;
125   for (int i = 0; i < properties_in_path.size(); ++i) {
126     ICING_ASSIGN_OR_RETURN(
127         const PropertyConfigProto *prop,
128         GetPropertyConfig(curr_schema_type,
129                           std::string(properties_in_path[i])));
130     if (prop->data_type() == PropertyConfigProto::DataType::STRING) {
131       TermMatchType::Code term_match_type =
132           prop->string_indexing_config().term_match_type();
133       bool indexable = term_match_type != TermMatchType::UNKNOWN;
134       return PropertyIndexInfo{indexable, term_match_type};
135     }
136     if (prop->data_type() == PropertyConfigProto::DataType::VECTOR) {
137       bool indexable =
138           prop->embedding_indexing_config().embedding_indexing_type() !=
139           EmbeddingIndexingConfig::EmbeddingIndexingType::UNKNOWN;
140       return PropertyIndexInfo{indexable};
141     }
142 
143     if (prop->data_type() != PropertyConfigProto::DataType::DOCUMENT) {
144       return PropertyIndexInfo{/*indexable=*/false};
145     }
146 
147     bool old_all_indexable_from_top = all_indexable_from_top;
148     all_indexable_from_top &=
149         prop->document_indexing_config().index_nested_properties();
150     if (!all_indexable_from_top && !in_indexable_properties_list) {
151       // Only try to update in_indexable_properties_list if this is the first
152       // level with index_nested_properties=false.
153       if (old_all_indexable_from_top) {
154         auto &indexable_properties =
155             prop->document_indexing_config().indexable_nested_properties_list();
156         std::string relative_path =
157             absl_ports::StrCatPieces(std::vector<std::string_view>(
158                 properties_in_path.begin() + i + 1, properties_in_path.end()));
159         in_indexable_properties_list =
160             std::find(indexable_properties.begin(), indexable_properties.end(),
161                       relative_path) != indexable_properties.end();
162       }
163       // Check in_indexable_properties_list again.
164       if (!in_indexable_properties_list) {
165         return PropertyIndexInfo{/*indexable=*/false};
166       }
167     }
168     curr_schema_type = prop->document_indexing_config().GetTypeName();
169   }
170   return PropertyIndexInfo{/*indexable=*/false};
171 }
172 
173 libtextclassifier3::StatusOr<bool>
DoesDocumentMatchQuery(const MonkeyTokenizedDocument & document,const SearchSpecProto & search_spec) const174 InMemoryIcingSearchEngine::DoesDocumentMatchQuery(
175     const MonkeyTokenizedDocument &document,
176     const SearchSpecProto &search_spec) const {
177   std::string_view query = search_spec.query();
178   std::vector<std::string_view> strs = absl_ports::StrSplit(query, ":");
179   std::string_view section_restrict;
180   if (strs.size() > 1) {
181     section_restrict = strs[0];
182     query = strs[1];
183   }
184 
185   // Preprocess for embedding search.
186   libtextclassifier3::StatusOr<std::pair<double, double>>
187       embedding_search_range_or = GetEmbeddingSearchRange(query);
188   std::unique_ptr<EmbeddingScorer> embedding_scorer;
189   if (embedding_search_range_or.ok()) {
190     ICING_ASSIGN_OR_RETURN(
191         embedding_scorer,
192         EmbeddingScorer::Create(search_spec.embedding_query_metric_type()));
193   }
194 
195   for (const MonkeyTokenizedSection §ion : document.tokenized_sections) {
196     if (!section_restrict.empty() && section.path != section_restrict) {
197       continue;
198     }
199     ICING_ASSIGN_OR_RETURN(
200         PropertyIndexInfo property_index_info,
201         GetPropertyIndexInfo(document.document.schema(), section));
202     if (!property_index_info.indexable) {
203       // Skip non-indexable property.
204       continue;
205     }
206 
207     if (embedding_search_range_or.ok()) {
208       // Process embedding search.
209       for (const PropertyProto::VectorProto &vector :
210            section.embedding_vectors) {
211         if (DoesVectorsMatch(embedding_scorer.get(),
212                              embedding_search_range_or.ValueOrDie(),
213                              search_spec.embedding_query_vectors(0), vector)) {
214           return true;
215         }
216       }
217     } else {
218       // Process term search.
219       for (const std::string &token : section.token_sequence) {
220         if (property_index_info.term_match_type == TermMatchType::EXACT_ONLY ||
221             search_spec.term_match_type() == TermMatchType::EXACT_ONLY) {
222           if (token == query) {
223             return true;
224           }
225         } else if (IsPrefix(query, token)) {
226           return true;
227         }
228       }
229     }
230   }
231   return false;
232 }
233 
SetSchema(SchemaProto && schema)234 void InMemoryIcingSearchEngine::SetSchema(SchemaProto &&schema) {
235   schema_ = std::make_unique<SchemaProto>(std::move(schema));
236   property_config_map_.clear();
237   for (const SchemaTypeConfigProto &type_config : schema_->types()) {
238     auto &curr_property_map = property_config_map_[type_config.schema_type()];
239     for (const PropertyConfigProto &property_config :
240          type_config.properties()) {
241       curr_property_map.insert(
242           {property_config.property_name(), property_config});
243     }
244   }
245 }
246 
247 InMemoryIcingSearchEngine::PickDocumentResult
RandomPickDocument(float p_alive,float p_all,float p_other) const248 InMemoryIcingSearchEngine::RandomPickDocument(float p_alive, float p_all,
249                                               float p_other) const {
250   // Normalizing p_alive, p_all and p_other, so that they sum to 1.
251   if (p_alive == 0 && p_all == 0 && p_other == 0) {
252     p_alive = p_all = p_other = 1 / 3.;
253   } else {
254     float p_sum = p_alive + p_all + p_other;
255     p_alive = p_alive / p_sum;
256     p_all = p_all / p_sum;
257     p_other = p_other / p_sum;
258   }
259 
260   std::uniform_real_distribution<> real_dist(0, 1);
261   float p = real_dist(*random_);
262   if (p <= p_other || documents_.empty()) {
263     // 20 is a fair number of non-existing namespaces and uris, enough for
264     // monkey testing.
265     std::uniform_int_distribution<> dist(0, 19);
266     std::string name_space = absl_ports::StrCat("non_existing_namespace",
267                                                 std::to_string(dist(*random_)));
268     std::string uri =
269         absl_ports::StrCat("non_existing_uri", std::to_string(dist(*random_)));
270     return {name_space, uri};
271   }
272   p -= p_other;
273   DocumentId doc_id;
274   if (p <= p_all || existing_doc_ids_.empty()) {
275     std::uniform_int_distribution<DocumentId> dist(0, documents_.size() - 1);
276     doc_id = dist(*random_);
277   } else {
278     std::uniform_int_distribution<DocumentId> dist(
279         0, existing_doc_ids_.size() - 1);
280     doc_id = existing_doc_ids_[dist(*random_)];
281   }
282   InMemoryIcingSearchEngine::PickDocumentResult result = {
283       documents_[doc_id].document.namespace_(),
284       documents_[doc_id].document.uri()};
285 
286   // Even the (name_space, uri) of the picked doc_id has not been deleted
287   // specifically, doc_id may be outdated because of possible overwriting. So we
288   // need to find the latest document id, and return the latest DocumentProto.
289   auto latest_doc_id = InternalGet(result.name_space, result.uri);
290   if (latest_doc_id.ok()) {
291     result.document = documents_[latest_doc_id.ValueOrDie()].document;
292   }
293   return result;
294 }
295 
Put(const MonkeyTokenizedDocument & document)296 void InMemoryIcingSearchEngine::Put(const MonkeyTokenizedDocument &document) {
297   // Delete the old one if existing.
298   Delete(document.document.namespace_(), document.document.uri()).IgnoreError();
299   existing_doc_ids_.push_back(documents_.size());
300   namespace_uri_docid_map[document.document.namespace_()]
301                          [document.document.uri()] = documents_.size();
302   documents_.push_back(document);
303 }
304 
GetAllNamespaces() const305 std::unordered_set<std::string> InMemoryIcingSearchEngine::GetAllNamespaces()
306     const {
307   std::unordered_set<std::string> namespaces;
308   for (DocumentId doc_id : existing_doc_ids_) {
309     namespaces.insert(documents_[doc_id].document.namespace_());
310   }
311   return namespaces;
312 }
313 
Delete(const std::string & name_space,const std::string & uri)314 libtextclassifier3::Status InMemoryIcingSearchEngine::Delete(
315     const std::string &name_space, const std::string &uri) {
316   libtextclassifier3::StatusOr<DocumentId> doc_id_or =
317       InternalGet(name_space, uri);
318   if (doc_id_or.ok()) {
319     DocumentId doc_id = doc_id_or.ValueOrDie();
320     const DocumentProto &document = documents_[doc_id].document;
321     namespace_uri_docid_map[document.namespace_()].erase(document.uri());
322     auto end_itr =
323         std::remove(existing_doc_ids_.begin(), existing_doc_ids_.end(), doc_id);
324     existing_doc_ids_.erase(end_itr, existing_doc_ids_.end());
325   }
326   return doc_id_or.status();
327 }
328 
329 libtextclassifier3::StatusOr<uint32_t>
DeleteByNamespace(const std::string & name_space)330 InMemoryIcingSearchEngine::DeleteByNamespace(const std::string &name_space) {
331   std::vector<DocumentId> doc_ids_to_delete;
332   for (DocumentId doc_id : existing_doc_ids_) {
333     if (documents_[doc_id].document.namespace_() == name_space) {
334       doc_ids_to_delete.push_back(doc_id);
335     }
336   }
337   for (DocumentId doc_id : doc_ids_to_delete) {
338     const DocumentProto &document = documents_[doc_id].document;
339     if (!Delete(document.namespace_(), document.uri()).ok()) {
340       return absl_ports::InternalError(
341           "Should never happen. There are inconsistencies in the in-memory "
342           "Icing.");
343     }
344   }
345   return doc_ids_to_delete.size();
346 }
347 
348 libtextclassifier3::StatusOr<uint32_t>
DeleteBySchemaType(const std::string & schema_type)349 InMemoryIcingSearchEngine::DeleteBySchemaType(const std::string &schema_type) {
350   std::vector<DocumentId> doc_ids_to_delete;
351   for (DocumentId doc_id : existing_doc_ids_) {
352     if (documents_[doc_id].document.schema() == schema_type) {
353       doc_ids_to_delete.push_back(doc_id);
354     }
355   }
356   for (DocumentId doc_id : doc_ids_to_delete) {
357     const DocumentProto &document = documents_[doc_id].document;
358     if (!Delete(document.namespace_(), document.uri()).ok()) {
359       return absl_ports::InternalError(
360           "Should never happen. There are inconsistencies in the in-memory "
361           "Icing.");
362     }
363   }
364   return doc_ids_to_delete.size();
365 }
366 
DeleteByQuery(const SearchSpecProto & search_spec)367 libtextclassifier3::StatusOr<uint32_t> InMemoryIcingSearchEngine::DeleteByQuery(
368     const SearchSpecProto &search_spec) {
369   ICING_ASSIGN_OR_RETURN(std::vector<DocumentId> doc_ids_to_delete,
370                          InternalSearch(search_spec));
371   for (DocumentId doc_id : doc_ids_to_delete) {
372     const DocumentProto &document = documents_[doc_id].document;
373     if (!Delete(document.namespace_(), document.uri()).ok()) {
374       return absl_ports::InternalError(
375           "Should never happen. There are inconsistencies in the in-memory "
376           "Icing.");
377     }
378   }
379   return doc_ids_to_delete.size();
380 }
381 
382 libtextclassifier3::StatusOr<std::vector<DocumentProto>>
Search(const SearchSpecProto & search_spec) const383 InMemoryIcingSearchEngine::Search(const SearchSpecProto &search_spec) const {
384   ICING_ASSIGN_OR_RETURN(std::vector<DocumentId> matched_doc_ids,
385                          InternalSearch(search_spec));
386   std::vector<DocumentProto> result;
387   result.reserve(matched_doc_ids.size());
388   for (DocumentId doc_id : matched_doc_ids) {
389     result.push_back(documents_[doc_id].document);
390   }
391   return result;
392 }
393 
InternalGet(const std::string & name_space,const std::string & uri) const394 libtextclassifier3::StatusOr<DocumentId> InMemoryIcingSearchEngine::InternalGet(
395     const std::string &name_space, const std::string &uri) const {
396   auto uris = namespace_uri_docid_map.find(name_space);
397   if (uris != namespace_uri_docid_map.end()) {
398     auto doc = uris->second.find(uri);
399     if (doc != uris->second.end()) {
400       return doc->second;
401     }
402   }
403   return absl_ports::NotFoundError(absl_ports::StrCat(
404       name_space, ", ", uri,
405       " is not found by InMemoryIcingSearchEngine::InternalGet."));
406 }
407 
408 libtextclassifier3::StatusOr<std::vector<DocumentId>>
InternalSearch(const SearchSpecProto & search_spec) const409 InMemoryIcingSearchEngine::InternalSearch(
410     const SearchSpecProto &search_spec) const {
411   std::vector<DocumentId> matched_doc_ids;
412   for (DocumentId doc_id : existing_doc_ids_) {
413     ICING_ASSIGN_OR_RETURN(
414         bool match, DoesDocumentMatchQuery(documents_[doc_id], search_spec));
415     if (match) {
416       matched_doc_ids.push_back(doc_id);
417     }
418   }
419   return matched_doc_ids;
420 }
421 
422 }  // namespace lib
423 }  // namespace icing
424