1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/monkey_test/in-memory-icing-search-engine.h"
16
17 #include <algorithm>
18 #include <cstdint>
19 #include <memory>
20 #include <random>
21 #include <string>
22 #include <string_view>
23 #include <unordered_set>
24 #include <utility>
25 #include <vector>
26
27 #include "icing/text_classifier/lib3/utils/base/status.h"
28 #include "icing/text_classifier/lib3/utils/base/statusor.h"
29 #include "icing/absl_ports/canonical_errors.h"
30 #include "icing/absl_ports/str_cat.h"
31 #include "icing/absl_ports/str_join.h"
32 #include "icing/monkey_test/monkey-tokenized-document.h"
33 #include "icing/proto/document.pb.h"
34 #include "icing/proto/schema.pb.h"
35 #include "icing/proto/search.pb.h"
36 #include "icing/proto/term.pb.h"
37 #include "icing/store/document-id.h"
38 #include "icing/util/status-macros.h"
39
40 namespace icing {
41 namespace lib {
42
43 namespace {
44
45 // Check if s1 is a prefix of s2.
IsPrefix(std::string_view s1,std::string_view s2)46 bool IsPrefix(std::string_view s1, std::string_view s2) {
47 if (s1.length() > s2.length()) {
48 return false;
49 }
50 return s1 == s2.substr(0, s1.length());
51 }
52
53 } // namespace
54
55 libtextclassifier3::StatusOr<const PropertyConfigProto *>
GetPropertyConfig(const std::string & schema_type,const std::string & property_name) const56 InMemoryIcingSearchEngine::GetPropertyConfig(
57 const std::string &schema_type, const std::string &property_name) const {
58 auto schema_iter = property_config_map_.find(schema_type);
59 if (schema_iter == property_config_map_.end()) {
60 return absl_ports::NotFoundError(
61 absl_ports::StrCat("Schema type: ", schema_type, " is not found."));
62 }
63 auto property_iter = schema_iter->second.find(property_name);
64 if (property_iter == schema_iter->second.end()) {
65 return absl_ports::NotFoundError(
66 absl_ports::StrCat("Property: ", property_name, " is not found."));
67 }
68 return &property_iter->second;
69 }
70
71 libtextclassifier3::StatusOr<TermMatchType::Code>
GetTermMatchType(const std::string & schema_type,const MonkeyTokenizedSection & section) const72 InMemoryIcingSearchEngine::GetTermMatchType(
73 const std::string &schema_type,
74 const MonkeyTokenizedSection §ion) const {
75 bool in_indexable_properties_list = false;
76 bool all_indexable_from_top = true;
77
78 std::vector<std::string_view> properties_in_path =
79 absl_ports::StrSplit(section.path, ".");
80 if (properties_in_path.empty()) {
81 return absl_ports::InvalidArgumentError("Got empty path.");
82 }
83 std::string curr_schema_type = schema_type;
84 for (int i = 0; i < properties_in_path.size(); ++i) {
85 ICING_ASSIGN_OR_RETURN(
86 const PropertyConfigProto *prop,
87 GetPropertyConfig(curr_schema_type,
88 std::string(properties_in_path[i])));
89 if (prop->data_type() == PropertyConfigProto::DataType::STRING) {
90 return prop->string_indexing_config().term_match_type();
91 }
92
93 if (prop->data_type() != PropertyConfigProto::DataType::DOCUMENT) {
94 return TermMatchType::Code::TermMatchType_Code_UNKNOWN;
95 }
96
97 bool old_all_indexable_from_top = all_indexable_from_top;
98 all_indexable_from_top &=
99 prop->document_indexing_config().index_nested_properties();
100 if (!all_indexable_from_top && !in_indexable_properties_list) {
101 // Only try to update in_indexable_properties_list if this is the first
102 // level with index_nested_properties=false.
103 if (old_all_indexable_from_top) {
104 auto &indexable_properties =
105 prop->document_indexing_config().indexable_nested_properties_list();
106 std::string relative_path =
107 absl_ports::StrCatPieces(std::vector<std::string_view>(
108 properties_in_path.begin() + i + 1, properties_in_path.end()));
109 in_indexable_properties_list =
110 std::find(indexable_properties.begin(), indexable_properties.end(),
111 relative_path) != indexable_properties.end();
112 }
113 // Check in_indexable_properties_list again.
114 if (!in_indexable_properties_list) {
115 return TermMatchType::Code::TermMatchType_Code_UNKNOWN;
116 }
117 }
118 curr_schema_type = prop->document_indexing_config().GetTypeName();
119 }
120 return TermMatchType::Code::TermMatchType_Code_UNKNOWN;
121 }
122
123 libtextclassifier3::StatusOr<bool>
DoesDocumentMatchQuery(const MonkeyTokenizedDocument & document,const std::string & query,TermMatchType::Code term_match_type) const124 InMemoryIcingSearchEngine::DoesDocumentMatchQuery(
125 const MonkeyTokenizedDocument &document, const std::string &query,
126 TermMatchType::Code term_match_type) const {
127 std::vector<std::string_view> strs = absl_ports::StrSplit(query, ":");
128 std::string_view query_term;
129 std::string_view section_restrict;
130 if (strs.size() > 1) {
131 section_restrict = strs[0];
132 query_term = strs[1];
133 } else {
134 query_term = query;
135 }
136 for (const MonkeyTokenizedSection §ion : document.tokenized_sections) {
137 if (!section_restrict.empty() && section.path != section_restrict) {
138 continue;
139 }
140 ICING_ASSIGN_OR_RETURN(
141 TermMatchType::Code section_term_match_type,
142 GetTermMatchType(document.document.schema(), section));
143 if (section_term_match_type == TermMatchType::UNKNOWN) {
144 // Skip non-indexable property.
145 continue;
146 }
147 for (const std::string &token : section.token_sequence) {
148 if (section_term_match_type == TermMatchType::EXACT_ONLY ||
149 term_match_type == TermMatchType::EXACT_ONLY) {
150 if (token == query_term) {
151 return true;
152 }
153 } else if (IsPrefix(query_term, token)) {
154 return true;
155 }
156 }
157 }
158 return false;
159 }
160
SetSchema(SchemaProto && schema)161 void InMemoryIcingSearchEngine::SetSchema(SchemaProto &&schema) {
162 schema_ = std::make_unique<SchemaProto>(std::move(schema));
163 property_config_map_.clear();
164 for (const SchemaTypeConfigProto &type_config : schema_->types()) {
165 auto &curr_property_map = property_config_map_[type_config.schema_type()];
166 for (const PropertyConfigProto &property_config :
167 type_config.properties()) {
168 curr_property_map.insert(
169 {property_config.property_name(), property_config});
170 }
171 }
172 }
173
174 InMemoryIcingSearchEngine::PickDocumentResult
RandomPickDocument(float p_alive,float p_all,float p_other) const175 InMemoryIcingSearchEngine::RandomPickDocument(float p_alive, float p_all,
176 float p_other) const {
177 // Normalizing p_alive, p_all and p_other, so that they sum to 1.
178 if (p_alive == 0 && p_all == 0 && p_other == 0) {
179 p_alive = p_all = p_other = 1 / 3.;
180 } else {
181 float p_sum = p_alive + p_all + p_other;
182 p_alive = p_alive / p_sum;
183 p_all = p_all / p_sum;
184 p_other = p_other / p_sum;
185 }
186
187 std::uniform_real_distribution<> real_dist(0, 1);
188 float p = real_dist(*random_);
189 if (p <= p_other || documents_.empty()) {
190 // 20 is a fair number of non-existing namespaces and uris, enough for
191 // monkey testing.
192 std::uniform_int_distribution<> dist(0, 19);
193 std::string name_space = absl_ports::StrCat("non_existing_namespace",
194 std::to_string(dist(*random_)));
195 std::string uri =
196 absl_ports::StrCat("non_existing_uri", std::to_string(dist(*random_)));
197 return {name_space, uri};
198 }
199 p -= p_other;
200 DocumentId doc_id;
201 if (p <= p_all || existing_doc_ids_.empty()) {
202 std::uniform_int_distribution<DocumentId> dist(0, documents_.size() - 1);
203 doc_id = dist(*random_);
204 } else {
205 std::uniform_int_distribution<DocumentId> dist(
206 0, existing_doc_ids_.size() - 1);
207 doc_id = existing_doc_ids_[dist(*random_)];
208 }
209 InMemoryIcingSearchEngine::PickDocumentResult result = {
210 documents_[doc_id].document.namespace_(),
211 documents_[doc_id].document.uri()};
212
213 // Even the (name_space, uri) of the picked doc_id has not been deleted
214 // specifically, doc_id may be outdated because of possible overwriting. So we
215 // need to find the latest document id, and return the latest DocumentProto.
216 auto latest_doc_id = InternalGet(result.name_space, result.uri);
217 if (latest_doc_id.ok()) {
218 result.document = documents_[latest_doc_id.ValueOrDie()].document;
219 }
220 return result;
221 }
222
Put(const MonkeyTokenizedDocument & document)223 void InMemoryIcingSearchEngine::Put(const MonkeyTokenizedDocument &document) {
224 // Delete the old one if existing.
225 Delete(document.document.namespace_(), document.document.uri()).IgnoreError();
226 existing_doc_ids_.push_back(documents_.size());
227 namespace_uri_docid_map[document.document.namespace_()]
228 [document.document.uri()] = documents_.size();
229 documents_.push_back(document);
230 }
231
GetAllNamespaces() const232 std::unordered_set<std::string> InMemoryIcingSearchEngine::GetAllNamespaces()
233 const {
234 std::unordered_set<std::string> namespaces;
235 for (DocumentId doc_id : existing_doc_ids_) {
236 namespaces.insert(documents_[doc_id].document.namespace_());
237 }
238 return namespaces;
239 }
240
Delete(const std::string & name_space,const std::string & uri)241 libtextclassifier3::Status InMemoryIcingSearchEngine::Delete(
242 const std::string &name_space, const std::string &uri) {
243 libtextclassifier3::StatusOr<DocumentId> doc_id_or =
244 InternalGet(name_space, uri);
245 if (doc_id_or.ok()) {
246 DocumentId doc_id = doc_id_or.ValueOrDie();
247 const DocumentProto &document = documents_[doc_id].document;
248 namespace_uri_docid_map[document.namespace_()].erase(document.uri());
249 auto end_itr =
250 std::remove(existing_doc_ids_.begin(), existing_doc_ids_.end(), doc_id);
251 existing_doc_ids_.erase(end_itr, existing_doc_ids_.end());
252 }
253 return doc_id_or.status();
254 }
255
256 libtextclassifier3::StatusOr<uint32_t>
DeleteByNamespace(const std::string & name_space)257 InMemoryIcingSearchEngine::DeleteByNamespace(const std::string &name_space) {
258 std::vector<DocumentId> doc_ids_to_delete;
259 for (DocumentId doc_id : existing_doc_ids_) {
260 if (documents_[doc_id].document.namespace_() == name_space) {
261 doc_ids_to_delete.push_back(doc_id);
262 }
263 }
264 for (DocumentId doc_id : doc_ids_to_delete) {
265 const DocumentProto &document = documents_[doc_id].document;
266 if (!Delete(document.namespace_(), document.uri()).ok()) {
267 return absl_ports::InternalError(
268 "Should never happen. There are inconsistencies in the in-memory "
269 "Icing.");
270 }
271 }
272 return doc_ids_to_delete.size();
273 }
274
275 libtextclassifier3::StatusOr<uint32_t>
DeleteBySchemaType(const std::string & schema_type)276 InMemoryIcingSearchEngine::DeleteBySchemaType(const std::string &schema_type) {
277 std::vector<DocumentId> doc_ids_to_delete;
278 for (DocumentId doc_id : existing_doc_ids_) {
279 if (documents_[doc_id].document.schema() == schema_type) {
280 doc_ids_to_delete.push_back(doc_id);
281 }
282 }
283 for (DocumentId doc_id : doc_ids_to_delete) {
284 const DocumentProto &document = documents_[doc_id].document;
285 if (!Delete(document.namespace_(), document.uri()).ok()) {
286 return absl_ports::InternalError(
287 "Should never happen. There are inconsistencies in the in-memory "
288 "Icing.");
289 }
290 }
291 return doc_ids_to_delete.size();
292 }
293
DeleteByQuery(const SearchSpecProto & search_spec)294 libtextclassifier3::StatusOr<uint32_t> InMemoryIcingSearchEngine::DeleteByQuery(
295 const SearchSpecProto &search_spec) {
296 ICING_ASSIGN_OR_RETURN(std::vector<DocumentId> doc_ids_to_delete,
297 InternalSearch(search_spec));
298 for (DocumentId doc_id : doc_ids_to_delete) {
299 const DocumentProto &document = documents_[doc_id].document;
300 if (!Delete(document.namespace_(), document.uri()).ok()) {
301 return absl_ports::InternalError(
302 "Should never happen. There are inconsistencies in the in-memory "
303 "Icing.");
304 }
305 }
306 return doc_ids_to_delete.size();
307 }
308
309 libtextclassifier3::StatusOr<std::vector<DocumentProto>>
Search(const SearchSpecProto & search_spec) const310 InMemoryIcingSearchEngine::Search(const SearchSpecProto &search_spec) const {
311 ICING_ASSIGN_OR_RETURN(std::vector<DocumentId> matched_doc_ids,
312 InternalSearch(search_spec));
313 std::vector<DocumentProto> result;
314 result.reserve(matched_doc_ids.size());
315 for (DocumentId doc_id : matched_doc_ids) {
316 result.push_back(documents_[doc_id].document);
317 }
318 return result;
319 }
320
InternalGet(const std::string & name_space,const std::string & uri) const321 libtextclassifier3::StatusOr<DocumentId> InMemoryIcingSearchEngine::InternalGet(
322 const std::string &name_space, const std::string &uri) const {
323 auto uris = namespace_uri_docid_map.find(name_space);
324 if (uris != namespace_uri_docid_map.end()) {
325 auto doc = uris->second.find(uri);
326 if (doc != uris->second.end()) {
327 return doc->second;
328 }
329 }
330 return absl_ports::NotFoundError(absl_ports::StrCat(
331 name_space, ", ", uri,
332 " is not found by InMemoryIcingSearchEngine::InternalGet."));
333 }
334
335 libtextclassifier3::StatusOr<std::vector<DocumentId>>
InternalSearch(const SearchSpecProto & search_spec) const336 InMemoryIcingSearchEngine::InternalSearch(
337 const SearchSpecProto &search_spec) const {
338 std::vector<DocumentId> matched_doc_ids;
339 for (DocumentId doc_id : existing_doc_ids_) {
340 ICING_ASSIGN_OR_RETURN(
341 bool match,
342 DoesDocumentMatchQuery(documents_[doc_id], search_spec.query(),
343 search_spec.term_match_type()));
344 if (match) {
345 matched_doc_ids.push_back(doc_id);
346 }
347 }
348 return matched_doc_ids;
349 }
350
351 } // namespace lib
352 } // namespace icing
353