• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/monkey_test/monkey-test-generators.h"
16 
17 #include <array>
18 #include <cstdint>
19 #include <random>
20 #include <string>
21 #include <string_view>
22 #include <unordered_set>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/absl_ports/str_cat.h"
27 #include "icing/absl_ports/str_join.h"
28 #include "icing/document-builder.h"
29 #include "icing/monkey_test/monkey-test-util.h"
30 #include "icing/monkey_test/monkey-tokenized-document.h"
31 #include "icing/proto/schema.pb.h"
32 #include "icing/proto/term.pb.h"
33 #include "icing/schema/section.h"
34 
35 namespace icing {
36 namespace lib {
37 
38 namespace {
39 
40 constexpr std::array<PropertyConfigProto::Cardinality::Code, 3> kCardinalities =
41     {PropertyConfigProto::Cardinality::REPEATED,
42      PropertyConfigProto::Cardinality::OPTIONAL,
43      PropertyConfigProto::Cardinality::REQUIRED};
44 
45 constexpr std::array<TermMatchType::Code, 3> kTermMatchTypes = {
46     TermMatchType::UNKNOWN, TermMatchType::EXACT_ONLY, TermMatchType::PREFIX};
47 
GetRandomCardinality(MonkeyTestRandomEngine * random)48 PropertyConfigProto::Cardinality::Code GetRandomCardinality(
49     MonkeyTestRandomEngine* random) {
50   std::uniform_int_distribution<> dist(0, kCardinalities.size() - 1);
51   return kCardinalities[dist(*random)];
52 }
53 
GetRandomIndexableTermMatchType(MonkeyTestRandomEngine * random)54 TermMatchType::Code GetRandomIndexableTermMatchType(
55     MonkeyTestRandomEngine* random) {
56   std::uniform_int_distribution<> dist(1, kTermMatchTypes.size() - 1);
57   return kTermMatchTypes[dist(*random)];
58 }
59 
GetRandomBoolean(MonkeyTestRandomEngine * random)60 bool GetRandomBoolean(MonkeyTestRandomEngine* random) {
61   std::uniform_int_distribution<> dist(0, 1);
62   return dist(*random) == 1;
63 }
64 
65 // TODO: Update this function when supporting document_indexing_config.
IsIndexableProperty(const PropertyConfigProto & property)66 bool IsIndexableProperty(const PropertyConfigProto& property) {
67   return property.string_indexing_config().term_match_type() !=
68              TermMatchType::UNKNOWN ||
69          property.embedding_indexing_config().embedding_indexing_type() !=
70              EmbeddingIndexingConfig::EmbeddingIndexingType::UNKNOWN;
71 }
72 
SetStringIndexingConfig(MonkeyTestRandomEngine * random,PropertyConfigProto & property,bool indexable)73 void SetStringIndexingConfig(MonkeyTestRandomEngine* random,
74                              PropertyConfigProto& property, bool indexable) {
75   property.clear_string_indexing_config();
76   if (indexable) {
77     StringIndexingConfig* string_indexing_config =
78         property.mutable_string_indexing_config();
79     string_indexing_config->set_term_match_type(
80         GetRandomIndexableTermMatchType(random));
81     // TODO: Try to add different TokenizerTypes. VERBATIM, RFC822, and URL are
82     // the remaining candidates to consider.
83     string_indexing_config->set_tokenizer_type(
84         StringIndexingConfig::TokenizerType::PLAIN);
85   }
86 }
87 
SetEmbeddingIndexingConfig(MonkeyTestRandomEngine * random,PropertyConfigProto & property,bool indexable)88 void SetEmbeddingIndexingConfig(MonkeyTestRandomEngine* random,
89                                 PropertyConfigProto& property, bool indexable) {
90   property.clear_embedding_indexing_config();
91   if (indexable) {
92     property.mutable_embedding_indexing_config()->set_embedding_indexing_type(
93         EmbeddingIndexingConfig::EmbeddingIndexingType::LINEAR_SEARCH);
94   }
95 }
96 
97 }  // namespace
98 
GenerateSchema()99 SchemaProto MonkeySchemaGenerator::GenerateSchema() {
100   SchemaProto schema;
101   for (int i = 0; i < config_->num_types; ++i) {
102     *schema.add_types() = GenerateType();
103   }
104   return schema;
105 }
106 
UpdateSchema(const SchemaProto & schema)107 MonkeySchemaGenerator::UpdateSchemaResult MonkeySchemaGenerator::UpdateSchema(
108     const SchemaProto& schema) {
109   UpdateSchemaResult result = {std::move(schema)};
110   SchemaProto& new_schema = result.schema;
111 
112   // Delete up to 2 existing types.
113   std::uniform_int_distribution<> num_types_to_delete_dist(0, 2);
114   for (int num_types_to_delete = num_types_to_delete_dist(*random_);
115        num_types_to_delete >= 0; --num_types_to_delete) {
116     if (new_schema.types_size() > 0) {
117       std::uniform_int_distribution<> dist(0, new_schema.types_size() - 1);
118       int index_to_delete = dist(*random_);
119       result.schema_types_deleted.insert(
120           new_schema.types(index_to_delete).schema_type());
121       new_schema.mutable_types()->SwapElements(index_to_delete,
122                                                new_schema.types_size() - 1);
123       new_schema.mutable_types()->RemoveLast();
124     }
125   }
126 
127   // Updating about 1/3 of existing types.
128   for (int i = 0; i < new_schema.types_size(); ++i) {
129     std::uniform_int_distribution<> dist(0, 2);
130     if (dist(*random_) == 0) {
131       UpdateType(*new_schema.mutable_types(i), result);
132     }
133   }
134 
135   // Add up to 2 new types.
136   std::uniform_int_distribution<> num_types_to_add_dist(0, 2);
137   for (int num_types_to_add = num_types_to_add_dist(*random_);
138        num_types_to_add >= 0; --num_types_to_add) {
139     *new_schema.add_types() = GenerateType();
140   }
141 
142   return result;
143 }
144 
GenerateProperty(const SchemaTypeConfigProto & type_config,PropertyConfigProto::Cardinality::Code cardinality,bool indexable)145 PropertyConfigProto MonkeySchemaGenerator::GenerateProperty(
146     const SchemaTypeConfigProto& type_config,
147     PropertyConfigProto::Cardinality::Code cardinality, bool indexable) {
148   PropertyConfigProto prop;
149   prop.set_property_name(
150       "MonkeyTestProp" +
151       std::to_string(num_properties_generated_[type_config.schema_type()]++));
152   // TODO: Perhaps in future iterations we will want to generate more types of
153   // properties.
154   // Currently, we are generating either a string or a vector property.
155   if (GetRandomBoolean(random_)) {
156     prop.set_data_type(PropertyConfigProto::DataType::STRING);
157     SetStringIndexingConfig(random_, prop, indexable);
158   } else {
159     prop.set_data_type(PropertyConfigProto::DataType::VECTOR);
160     SetEmbeddingIndexingConfig(random_, prop, indexable);
161   }
162   prop.set_cardinality(cardinality);
163   return prop;
164 }
165 
UpdateProperty(const SchemaTypeConfigProto & type_config,PropertyConfigProto & property,UpdateSchemaResult & result)166 void MonkeySchemaGenerator::UpdateProperty(
167     const SchemaTypeConfigProto& type_config, PropertyConfigProto& property,
168     UpdateSchemaResult& result) {
169   PropertyConfigProto::Cardinality::Code new_cardinality =
170       GetRandomCardinality(random_);
171   if (new_cardinality != property.cardinality()) {
172     // Only do compatible cardinality update for now, otherwise it would be hard
173     // to track which documents will be invalid after updating the schema.
174     //
175     // The following type of updates are not allowed:
176     // - optional -> required
177     // - repeated -> optional
178     // - repeated -> required
179     if (property.cardinality() == PropertyConfigProto::Cardinality::OPTIONAL &&
180         new_cardinality == PropertyConfigProto::Cardinality::REQUIRED) {
181       return;
182     }
183     if (property.cardinality() == PropertyConfigProto::Cardinality::REPEATED &&
184         (new_cardinality == PropertyConfigProto::Cardinality::OPTIONAL ||
185          new_cardinality == PropertyConfigProto::Cardinality::REQUIRED)) {
186       return;
187     }
188     property.set_cardinality(new_cardinality);
189   }
190 
191   bool old_indexable = IsIndexableProperty(property);
192   bool new_indexable = GetRandomBoolean(random_);
193   bool index_incompatible = old_indexable != new_indexable;
194   if (property.data_type() == PropertyConfigProto::DataType::STRING) {
195     TermMatchType::Code old_term_match_type =
196         property.string_indexing_config().term_match_type();
197     SetStringIndexingConfig(random_, property, new_indexable);
198     TermMatchType::Code new_term_match_type =
199         property.string_indexing_config().term_match_type();
200     if (old_term_match_type != new_term_match_type) {
201       index_incompatible = true;
202     }
203   } else if (property.data_type() == PropertyConfigProto::DataType::VECTOR) {
204     SetEmbeddingIndexingConfig(random_, property, new_indexable);
205   }
206   if (index_incompatible) {
207     result.schema_types_index_incompatible.insert(type_config.schema_type());
208   }
209 }
210 
GenerateType()211 SchemaTypeConfigProto MonkeySchemaGenerator::GenerateType() {
212   SchemaTypeConfigProto type_config;
213   type_config.set_schema_type("MonkeyTestType" +
214                               std::to_string(num_types_generated_++));
215   std::uniform_int_distribution<> possible_num_properties_dist(
216       0, config_->possible_num_properties.size() - 1);
217   int total_num_properties =
218       config_->possible_num_properties[possible_num_properties_dist(*random_)];
219 
220   int num_indexed_properties = 0;
221   for (int i = 0; i < total_num_properties; ++i) {
222     bool indexable = false;
223     if (num_indexed_properties < kTotalNumSections) {
224       indexable = GetRandomBoolean(random_);
225     }
226     if (indexable) {
227       num_indexed_properties += 1;
228     }
229     (*type_config.add_properties()) =
230         GenerateProperty(type_config, GetRandomCardinality(random_), indexable);
231   }
232   return type_config;
233 }
234 
UpdateType(SchemaTypeConfigProto & type_config,UpdateSchemaResult & result)235 void MonkeySchemaGenerator::UpdateType(SchemaTypeConfigProto& type_config,
236                                        UpdateSchemaResult& result) {
237   // Delete up to 4 existing property.
238   std::uniform_int_distribution<> num_properties_to_delete_dist(0, 4);
239   for (int num_properties_to_delete = num_properties_to_delete_dist(*random_);
240        num_properties_to_delete >= 0; --num_properties_to_delete) {
241     if (type_config.properties_size() > 0) {
242       std::uniform_int_distribution<> dist(0,
243                                            type_config.properties_size() - 1);
244       int index_to_delete = dist(*random_);
245       // Only delete a required property for now, otherwise it would be hard
246       // to track which documents will be invalid after updating the schema.
247       if (type_config.properties(index_to_delete).cardinality() !=
248           PropertyConfigProto::Cardinality::REQUIRED) {
249         continue;
250       }
251       if (IsIndexableProperty(type_config.properties(index_to_delete))) {
252         result.schema_types_index_incompatible.insert(
253             type_config.schema_type());
254       }
255       // Removing a property will cause the type to be considered as
256       // incompatible.
257       result.schema_types_incompatible.insert(type_config.schema_type());
258 
259       type_config.mutable_properties()->SwapElements(
260           index_to_delete, type_config.properties_size() - 1);
261       type_config.mutable_properties()->RemoveLast();
262     }
263   }
264 
265   // Updating about 1/3 of existing properties.
266   for (int i = 0; i < type_config.properties_size(); ++i) {
267     std::uniform_int_distribution<> dist(0, 2);
268     if (dist(*random_) == 0) {
269       UpdateProperty(type_config, *type_config.mutable_properties(i), result);
270     }
271   }
272 
273   // Add up to 4 new properties.
274   std::uniform_int_distribution<> num_types_to_add_dist(0, 4);
275   for (int num_types_to_add = num_types_to_add_dist(*random_);
276        num_types_to_add >= 0; --num_types_to_add) {
277     PropertyConfigProto::Cardinality::Code new_cardinality =
278         GetRandomCardinality(random_);
279     // Adding a required property will make all document of this type invalid.
280     if (new_cardinality == PropertyConfigProto::Cardinality::REQUIRED) {
281       result.schema_types_incompatible.insert(type_config.schema_type());
282     }
283     bool indexable = GetRandomBoolean(random_);
284     PropertyConfigProto new_property =
285         GenerateProperty(type_config, new_cardinality, indexable);
286     if (indexable) {
287       result.schema_types_index_incompatible.insert(type_config.schema_type());
288     }
289     (*type_config.add_properties()) = std::move(new_property);
290   }
291 
292   int num_indexed_properties = 0;
293   for (int i = 0; i < type_config.properties_size(); ++i) {
294     if (IsIndexableProperty(type_config.properties(i))) {
295       ++num_indexed_properties;
296     }
297   }
298 
299   if (num_indexed_properties > kTotalNumSections) {
300     result.is_invalid_schema = true;
301   }
302 }
303 
GetNamespace() const304 std::string MonkeyDocumentGenerator::GetNamespace() const {
305   uint32_t name_space;
306   // When num_namespaces is 0, all documents generated get different namespaces.
307   // Otherwise, namespaces will be randomly picked from a set with
308   // num_namespaces elements.
309   if (config_->num_namespaces == 0) {
310     name_space = num_docs_generated_;
311   } else {
312     std::uniform_int_distribution<> dist(0, config_->num_namespaces - 1);
313     name_space = dist(*random_);
314   }
315   return absl_ports::StrCat("namespace", std::to_string(name_space));
316 }
317 
GetUri() const318 std::string MonkeyDocumentGenerator::GetUri() const {
319   uint32_t uri;
320   // When num_uris is 0, all documents generated get different URIs. Otherwise,
321   // URIs will be randomly picked from a set with num_uris elements.
322   if (config_->num_uris == 0) {
323     uri = num_docs_generated_;
324   } else {
325     std::uniform_int_distribution<> dist(0, config_->num_uris - 1);
326     uri = dist(*random_);
327   }
328   return absl_ports::StrCat("uri", std::to_string(uri));
329 }
330 
GetNumTokens() const331 int MonkeyDocumentGenerator::GetNumTokens() const {
332   std::uniform_int_distribution<> dist(0,
333                                        config_->possible_num_tokens.size() - 1);
334   int n = config_->possible_num_tokens[dist(*random_)];
335   // Add some noise
336   std::uniform_real_distribution<> real_dist(0.5, 1);
337   float p = real_dist(*random_);
338   return n * p;
339 }
340 
GetStringPropertyContent() const341 std::vector<std::string> MonkeyDocumentGenerator::GetStringPropertyContent()
342     const {
343   int num_tokens = GetNumTokens();
344   std::vector<std::string> content;
345   content.reserve(num_tokens);
346   while (num_tokens) {
347     content.push_back(std::string(GetToken()));
348     --num_tokens;
349   }
350   return content;
351 }
352 
GetNumVectors(PropertyConfigProto::Cardinality::Code cardinality) const353 int MonkeyDocumentGenerator::GetNumVectors(
354     PropertyConfigProto::Cardinality::Code cardinality) const {
355   if (cardinality == PropertyConfigProto::Cardinality::REQUIRED) {
356     return 1;
357   } else if (cardinality == PropertyConfigProto::Cardinality::OPTIONAL) {
358     std::uniform_int_distribution<> dist(0, 1);
359     return dist(*random_);
360   }
361 
362   // For repeated properties:
363   std::uniform_int_distribution<> dist(
364       0, config_->possible_num_vectors.size() - 1);
365   int n = config_->possible_num_vectors[dist(*random_)];
366   // Add some noise
367   std::uniform_real_distribution<> real_dist(0.5, 1);
368   float p = real_dist(*random_);
369   return n * p;
370 }
371 
GetRandomVector() const372 PropertyProto::VectorProto MonkeyDocumentGenerator::GetRandomVector() const {
373   std::uniform_int_distribution<> dimension_dist(
374       0, config_->possible_vector_dimensions.size() - 1);
375   std::uniform_real_distribution<float> value_dist(-1.0, 1.0);
376 
377   PropertyProto::VectorProto vector;
378   vector.set_model_signature("model");
379   int dimension = config_->possible_vector_dimensions[dimension_dist(*random_)];
380   for (int i = 0; i < dimension; ++i) {
381     vector.add_values(value_dist(*random_));
382   }
383   return vector;
384 }
385 
386 std::vector<PropertyProto::VectorProto>
GetVectorPropertyContent(PropertyConfigProto::Cardinality::Code cardinality) const387 MonkeyDocumentGenerator::GetVectorPropertyContent(
388     PropertyConfigProto::Cardinality::Code cardinality) const {
389   int num_vectors = GetNumVectors(cardinality);
390   std::vector<PropertyProto::VectorProto> content;
391   content.reserve(num_vectors);
392   while (num_vectors) {
393     content.push_back(GetRandomVector());
394     --num_vectors;
395   }
396   return content;
397 }
398 
GenerateDocument()399 MonkeyTokenizedDocument MonkeyDocumentGenerator::GenerateDocument() {
400   MonkeyTokenizedDocument document;
401   const SchemaTypeConfigProto& type_config = GetType();
402   const std::string& name_space = GetNamespace();
403   DocumentBuilder doc_builder =
404       DocumentBuilder()
405           .SetNamespace(name_space)
406           .SetSchema(type_config.schema_type())
407           .SetUri(GetUri())
408           .SetCreationTimestampMs(clock_.GetSystemTimeMilliseconds());
409   for (const PropertyConfigProto& prop : type_config.properties()) {
410     if (prop.data_type() == PropertyConfigProto::DataType::STRING) {
411       std::vector<std::string> prop_content = GetStringPropertyContent();
412       doc_builder.AddStringProperty(prop.property_name(),
413                                     absl_ports::StrJoin(prop_content, " "));
414       // No matter whether the property is indexable currently, we have to
415       // create a section for it since a non-indexable property can become
416       // indexable after a schema type change. The in-memory icing will
417       // automatically skip sections that are non-indexable at the time of
418       // search requests.
419       MonkeyTokenizedSection section = {prop.property_name(),
420                                         std::move(prop_content)};
421       document.tokenized_sections.push_back(std::move(section));
422     } else {
423       std::vector<PropertyProto::VectorProto> prop_content =
424           GetVectorPropertyContent(prop.cardinality());
425       doc_builder.AddVectorProperty(prop.property_name(), prop_content);
426 
427       // Similar to the string property, no matter whether the property is
428       // indexable currently, we have to create a section for it.
429       MonkeyTokenizedSection section = {
430           prop.property_name(), /*token_sequence=*/{}, std::move(prop_content)};
431       document.tokenized_sections.push_back(std::move(section));
432     }
433   }
434   document.document = doc_builder.Build();
435   ++num_docs_generated_;
436   return document;
437 }
438 
439 }  // namespace lib
440 }  // namespace icing
441