• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/monkey_test/in-memory-icing-search-engine.h"
16 
17 #include <algorithm>
18 #include <cstdint>
19 #include <memory>
20 #include <random>
21 #include <string>
22 #include <string_view>
23 #include <unordered_set>
24 #include <utility>
25 #include <vector>
26 
27 #include "icing/text_classifier/lib3/utils/base/status.h"
28 #include "icing/text_classifier/lib3/utils/base/statusor.h"
29 #include "icing/absl_ports/canonical_errors.h"
30 #include "icing/absl_ports/str_cat.h"
31 #include "icing/absl_ports/str_join.h"
32 #include "icing/monkey_test/monkey-tokenized-document.h"
33 #include "icing/proto/document.pb.h"
34 #include "icing/proto/schema.pb.h"
35 #include "icing/proto/search.pb.h"
36 #include "icing/proto/term.pb.h"
37 #include "icing/store/document-id.h"
38 #include "icing/util/status-macros.h"
39 
40 namespace icing {
41 namespace lib {
42 
43 namespace {
44 
45 // Check if s1 is a prefix of s2.
IsPrefix(std::string_view s1,std::string_view s2)46 bool IsPrefix(std::string_view s1, std::string_view s2) {
47   if (s1.length() > s2.length()) {
48     return false;
49   }
50   return s1 == s2.substr(0, s1.length());
51 }
52 
53 }  // namespace
54 
55 libtextclassifier3::StatusOr<const PropertyConfigProto *>
GetPropertyConfig(const std::string & schema_type,const std::string & property_name) const56 InMemoryIcingSearchEngine::GetPropertyConfig(
57     const std::string &schema_type, const std::string &property_name) const {
58   auto schema_iter = property_config_map_.find(schema_type);
59   if (schema_iter == property_config_map_.end()) {
60     return absl_ports::NotFoundError(
61         absl_ports::StrCat("Schema type: ", schema_type, " is not found."));
62   }
63   auto property_iter = schema_iter->second.find(property_name);
64   if (property_iter == schema_iter->second.end()) {
65     return absl_ports::NotFoundError(
66         absl_ports::StrCat("Property: ", property_name, " is not found."));
67   }
68   return &property_iter->second;
69 }
70 
71 libtextclassifier3::StatusOr<TermMatchType::Code>
GetTermMatchType(const std::string & schema_type,const MonkeyTokenizedSection & section) const72 InMemoryIcingSearchEngine::GetTermMatchType(
73     const std::string &schema_type,
74     const MonkeyTokenizedSection &section) const {
75   bool in_indexable_properties_list = false;
76   bool all_indexable_from_top = true;
77 
78   std::vector<std::string_view> properties_in_path =
79       absl_ports::StrSplit(section.path, ".");
80   if (properties_in_path.empty()) {
81     return absl_ports::InvalidArgumentError("Got empty path.");
82   }
83   std::string curr_schema_type = schema_type;
84   for (int i = 0; i < properties_in_path.size(); ++i) {
85     ICING_ASSIGN_OR_RETURN(
86         const PropertyConfigProto *prop,
87         GetPropertyConfig(curr_schema_type,
88                           std::string(properties_in_path[i])));
89     if (prop->data_type() == PropertyConfigProto::DataType::STRING) {
90       return prop->string_indexing_config().term_match_type();
91     }
92 
93     if (prop->data_type() != PropertyConfigProto::DataType::DOCUMENT) {
94       return TermMatchType::Code::TermMatchType_Code_UNKNOWN;
95     }
96 
97     bool old_all_indexable_from_top = all_indexable_from_top;
98     all_indexable_from_top &=
99         prop->document_indexing_config().index_nested_properties();
100     if (!all_indexable_from_top && !in_indexable_properties_list) {
101       // Only try to update in_indexable_properties_list if this is the first
102       // level with index_nested_properties=false.
103       if (old_all_indexable_from_top) {
104         auto &indexable_properties =
105             prop->document_indexing_config().indexable_nested_properties_list();
106         std::string relative_path =
107             absl_ports::StrCatPieces(std::vector<std::string_view>(
108                 properties_in_path.begin() + i + 1, properties_in_path.end()));
109         in_indexable_properties_list =
110             std::find(indexable_properties.begin(), indexable_properties.end(),
111                       relative_path) != indexable_properties.end();
112       }
113       // Check in_indexable_properties_list again.
114       if (!in_indexable_properties_list) {
115         return TermMatchType::Code::TermMatchType_Code_UNKNOWN;
116       }
117     }
118     curr_schema_type = prop->document_indexing_config().GetTypeName();
119   }
120   return TermMatchType::Code::TermMatchType_Code_UNKNOWN;
121 }
122 
123 libtextclassifier3::StatusOr<bool>
DoesDocumentMatchQuery(const MonkeyTokenizedDocument & document,const std::string & query,TermMatchType::Code term_match_type) const124 InMemoryIcingSearchEngine::DoesDocumentMatchQuery(
125     const MonkeyTokenizedDocument &document, const std::string &query,
126     TermMatchType::Code term_match_type) const {
127   std::vector<std::string_view> strs = absl_ports::StrSplit(query, ":");
128   std::string_view query_term;
129   std::string_view section_restrict;
130   if (strs.size() > 1) {
131     section_restrict = strs[0];
132     query_term = strs[1];
133   } else {
134     query_term = query;
135   }
136   for (const MonkeyTokenizedSection &section : document.tokenized_sections) {
137     if (!section_restrict.empty() && section.path != section_restrict) {
138       continue;
139     }
140     ICING_ASSIGN_OR_RETURN(
141         TermMatchType::Code section_term_match_type,
142         GetTermMatchType(document.document.schema(), section));
143     if (section_term_match_type == TermMatchType::UNKNOWN) {
144       // Skip non-indexable property.
145       continue;
146     }
147     for (const std::string &token : section.token_sequence) {
148       if (section_term_match_type == TermMatchType::EXACT_ONLY ||
149           term_match_type == TermMatchType::EXACT_ONLY) {
150         if (token == query_term) {
151           return true;
152         }
153       } else if (IsPrefix(query_term, token)) {
154         return true;
155       }
156     }
157   }
158   return false;
159 }
160 
SetSchema(SchemaProto && schema)161 void InMemoryIcingSearchEngine::SetSchema(SchemaProto &&schema) {
162   schema_ = std::make_unique<SchemaProto>(std::move(schema));
163   property_config_map_.clear();
164   for (const SchemaTypeConfigProto &type_config : schema_->types()) {
165     auto &curr_property_map = property_config_map_[type_config.schema_type()];
166     for (const PropertyConfigProto &property_config :
167          type_config.properties()) {
168       curr_property_map.insert(
169           {property_config.property_name(), property_config});
170     }
171   }
172 }
173 
174 InMemoryIcingSearchEngine::PickDocumentResult
RandomPickDocument(float p_alive,float p_all,float p_other) const175 InMemoryIcingSearchEngine::RandomPickDocument(float p_alive, float p_all,
176                                               float p_other) const {
177   // Normalizing p_alive, p_all and p_other, so that they sum to 1.
178   if (p_alive == 0 && p_all == 0 && p_other == 0) {
179     p_alive = p_all = p_other = 1 / 3.;
180   } else {
181     float p_sum = p_alive + p_all + p_other;
182     p_alive = p_alive / p_sum;
183     p_all = p_all / p_sum;
184     p_other = p_other / p_sum;
185   }
186 
187   std::uniform_real_distribution<> real_dist(0, 1);
188   float p = real_dist(*random_);
189   if (p <= p_other || documents_.empty()) {
190     // 20 is a fair number of non-existing namespaces and uris, enough for
191     // monkey testing.
192     std::uniform_int_distribution<> dist(0, 19);
193     std::string name_space = absl_ports::StrCat("non_existing_namespace",
194                                                 std::to_string(dist(*random_)));
195     std::string uri =
196         absl_ports::StrCat("non_existing_uri", std::to_string(dist(*random_)));
197     return {name_space, uri};
198   }
199   p -= p_other;
200   DocumentId doc_id;
201   if (p <= p_all || existing_doc_ids_.empty()) {
202     std::uniform_int_distribution<DocumentId> dist(0, documents_.size() - 1);
203     doc_id = dist(*random_);
204   } else {
205     std::uniform_int_distribution<DocumentId> dist(
206         0, existing_doc_ids_.size() - 1);
207     doc_id = existing_doc_ids_[dist(*random_)];
208   }
209   InMemoryIcingSearchEngine::PickDocumentResult result = {
210       documents_[doc_id].document.namespace_(),
211       documents_[doc_id].document.uri()};
212 
213   // Even the (name_space, uri) of the picked doc_id has not been deleted
214   // specifically, doc_id may be outdated because of possible overwriting. So we
215   // need to find the latest document id, and return the latest DocumentProto.
216   auto latest_doc_id = InternalGet(result.name_space, result.uri);
217   if (latest_doc_id.ok()) {
218     result.document = documents_[latest_doc_id.ValueOrDie()].document;
219   }
220   return result;
221 }
222 
Put(const MonkeyTokenizedDocument & document)223 void InMemoryIcingSearchEngine::Put(const MonkeyTokenizedDocument &document) {
224   // Delete the old one if existing.
225   Delete(document.document.namespace_(), document.document.uri()).IgnoreError();
226   existing_doc_ids_.push_back(documents_.size());
227   namespace_uri_docid_map[document.document.namespace_()]
228                          [document.document.uri()] = documents_.size();
229   documents_.push_back(document);
230 }
231 
GetAllNamespaces() const232 std::unordered_set<std::string> InMemoryIcingSearchEngine::GetAllNamespaces()
233     const {
234   std::unordered_set<std::string> namespaces;
235   for (DocumentId doc_id : existing_doc_ids_) {
236     namespaces.insert(documents_[doc_id].document.namespace_());
237   }
238   return namespaces;
239 }
240 
Delete(const std::string & name_space,const std::string & uri)241 libtextclassifier3::Status InMemoryIcingSearchEngine::Delete(
242     const std::string &name_space, const std::string &uri) {
243   libtextclassifier3::StatusOr<DocumentId> doc_id_or =
244       InternalGet(name_space, uri);
245   if (doc_id_or.ok()) {
246     DocumentId doc_id = doc_id_or.ValueOrDie();
247     const DocumentProto &document = documents_[doc_id].document;
248     namespace_uri_docid_map[document.namespace_()].erase(document.uri());
249     auto end_itr =
250         std::remove(existing_doc_ids_.begin(), existing_doc_ids_.end(), doc_id);
251     existing_doc_ids_.erase(end_itr, existing_doc_ids_.end());
252   }
253   return doc_id_or.status();
254 }
255 
256 libtextclassifier3::StatusOr<uint32_t>
DeleteByNamespace(const std::string & name_space)257 InMemoryIcingSearchEngine::DeleteByNamespace(const std::string &name_space) {
258   std::vector<DocumentId> doc_ids_to_delete;
259   for (DocumentId doc_id : existing_doc_ids_) {
260     if (documents_[doc_id].document.namespace_() == name_space) {
261       doc_ids_to_delete.push_back(doc_id);
262     }
263   }
264   for (DocumentId doc_id : doc_ids_to_delete) {
265     const DocumentProto &document = documents_[doc_id].document;
266     if (!Delete(document.namespace_(), document.uri()).ok()) {
267       return absl_ports::InternalError(
268           "Should never happen. There are inconsistencies in the in-memory "
269           "Icing.");
270     }
271   }
272   return doc_ids_to_delete.size();
273 }
274 
275 libtextclassifier3::StatusOr<uint32_t>
DeleteBySchemaType(const std::string & schema_type)276 InMemoryIcingSearchEngine::DeleteBySchemaType(const std::string &schema_type) {
277   std::vector<DocumentId> doc_ids_to_delete;
278   for (DocumentId doc_id : existing_doc_ids_) {
279     if (documents_[doc_id].document.schema() == schema_type) {
280       doc_ids_to_delete.push_back(doc_id);
281     }
282   }
283   for (DocumentId doc_id : doc_ids_to_delete) {
284     const DocumentProto &document = documents_[doc_id].document;
285     if (!Delete(document.namespace_(), document.uri()).ok()) {
286       return absl_ports::InternalError(
287           "Should never happen. There are inconsistencies in the in-memory "
288           "Icing.");
289     }
290   }
291   return doc_ids_to_delete.size();
292 }
293 
DeleteByQuery(const SearchSpecProto & search_spec)294 libtextclassifier3::StatusOr<uint32_t> InMemoryIcingSearchEngine::DeleteByQuery(
295     const SearchSpecProto &search_spec) {
296   ICING_ASSIGN_OR_RETURN(std::vector<DocumentId> doc_ids_to_delete,
297                          InternalSearch(search_spec));
298   for (DocumentId doc_id : doc_ids_to_delete) {
299     const DocumentProto &document = documents_[doc_id].document;
300     if (!Delete(document.namespace_(), document.uri()).ok()) {
301       return absl_ports::InternalError(
302           "Should never happen. There are inconsistencies in the in-memory "
303           "Icing.");
304     }
305   }
306   return doc_ids_to_delete.size();
307 }
308 
309 libtextclassifier3::StatusOr<std::vector<DocumentProto>>
Search(const SearchSpecProto & search_spec) const310 InMemoryIcingSearchEngine::Search(const SearchSpecProto &search_spec) const {
311   ICING_ASSIGN_OR_RETURN(std::vector<DocumentId> matched_doc_ids,
312                          InternalSearch(search_spec));
313   std::vector<DocumentProto> result;
314   result.reserve(matched_doc_ids.size());
315   for (DocumentId doc_id : matched_doc_ids) {
316     result.push_back(documents_[doc_id].document);
317   }
318   return result;
319 }
320 
InternalGet(const std::string & name_space,const std::string & uri) const321 libtextclassifier3::StatusOr<DocumentId> InMemoryIcingSearchEngine::InternalGet(
322     const std::string &name_space, const std::string &uri) const {
323   auto uris = namespace_uri_docid_map.find(name_space);
324   if (uris != namespace_uri_docid_map.end()) {
325     auto doc = uris->second.find(uri);
326     if (doc != uris->second.end()) {
327       return doc->second;
328     }
329   }
330   return absl_ports::NotFoundError(absl_ports::StrCat(
331       name_space, ", ", uri,
332       " is not found by InMemoryIcingSearchEngine::InternalGet."));
333 }
334 
335 libtextclassifier3::StatusOr<std::vector<DocumentId>>
InternalSearch(const SearchSpecProto & search_spec) const336 InMemoryIcingSearchEngine::InternalSearch(
337     const SearchSpecProto &search_spec) const {
338   std::vector<DocumentId> matched_doc_ids;
339   for (DocumentId doc_id : existing_doc_ids_) {
340     ICING_ASSIGN_OR_RETURN(
341         bool match,
342         DoesDocumentMatchQuery(documents_[doc_id], search_spec.query(),
343                                search_spec.term_match_type()));
344     if (match) {
345       matched_doc_ids.push_back(doc_id);
346     }
347   }
348   return matched_doc_ids;
349 }
350 
351 }  // namespace lib
352 }  // namespace icing
353