1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/monkey_test/monkey-test-generators.h"
16
17 #include <array>
18 #include <cstdint>
19 #include <random>
20 #include <string>
21 #include <string_view>
22 #include <unordered_set>
23 #include <utility>
24 #include <vector>
25
26 #include "icing/absl_ports/str_cat.h"
27 #include "icing/absl_ports/str_join.h"
28 #include "icing/document-builder.h"
29 #include "icing/monkey_test/monkey-test-util.h"
30 #include "icing/monkey_test/monkey-tokenized-document.h"
31 #include "icing/proto/schema.pb.h"
32 #include "icing/proto/term.pb.h"
33 #include "icing/schema/section.h"
34
35 namespace icing {
36 namespace lib {
37
38 namespace {
39
40 constexpr std::array<PropertyConfigProto::Cardinality::Code, 3> kCardinalities =
41 {PropertyConfigProto::Cardinality::REPEATED,
42 PropertyConfigProto::Cardinality::OPTIONAL,
43 PropertyConfigProto::Cardinality::REQUIRED};
44
45 constexpr std::array<TermMatchType::Code, 3> kTermMatchTypes = {
46 TermMatchType::UNKNOWN, TermMatchType::EXACT_ONLY, TermMatchType::PREFIX};
47
GetRandomCardinality(MonkeyTestRandomEngine * random)48 PropertyConfigProto::Cardinality::Code GetRandomCardinality(
49 MonkeyTestRandomEngine* random) {
50 std::uniform_int_distribution<> dist(0, kCardinalities.size() - 1);
51 return kCardinalities[dist(*random)];
52 }
53
GetRandomIndexableTermMatchType(MonkeyTestRandomEngine * random)54 TermMatchType::Code GetRandomIndexableTermMatchType(
55 MonkeyTestRandomEngine* random) {
56 std::uniform_int_distribution<> dist(1, kTermMatchTypes.size() - 1);
57 return kTermMatchTypes[dist(*random)];
58 }
59
GetRandomBoolean(MonkeyTestRandomEngine * random)60 bool GetRandomBoolean(MonkeyTestRandomEngine* random) {
61 std::uniform_int_distribution<> dist(0, 1);
62 return dist(*random) == 1;
63 }
64
65 // TODO: Update this function when supporting document_indexing_config.
IsIndexableProperty(const PropertyConfigProto & property)66 bool IsIndexableProperty(const PropertyConfigProto& property) {
67 return property.string_indexing_config().term_match_type() !=
68 TermMatchType::UNKNOWN ||
69 property.embedding_indexing_config().embedding_indexing_type() !=
70 EmbeddingIndexingConfig::EmbeddingIndexingType::UNKNOWN;
71 }
72
SetStringIndexingConfig(MonkeyTestRandomEngine * random,PropertyConfigProto & property,bool indexable)73 void SetStringIndexingConfig(MonkeyTestRandomEngine* random,
74 PropertyConfigProto& property, bool indexable) {
75 property.clear_string_indexing_config();
76 if (indexable) {
77 StringIndexingConfig* string_indexing_config =
78 property.mutable_string_indexing_config();
79 string_indexing_config->set_term_match_type(
80 GetRandomIndexableTermMatchType(random));
81 // TODO: Try to add different TokenizerTypes. VERBATIM, RFC822, and URL are
82 // the remaining candidates to consider.
83 string_indexing_config->set_tokenizer_type(
84 StringIndexingConfig::TokenizerType::PLAIN);
85 }
86 }
87
SetEmbeddingIndexingConfig(MonkeyTestRandomEngine * random,PropertyConfigProto & property,bool indexable)88 void SetEmbeddingIndexingConfig(MonkeyTestRandomEngine* random,
89 PropertyConfigProto& property, bool indexable) {
90 property.clear_embedding_indexing_config();
91 if (indexable) {
92 property.mutable_embedding_indexing_config()->set_embedding_indexing_type(
93 EmbeddingIndexingConfig::EmbeddingIndexingType::LINEAR_SEARCH);
94 }
95 }
96
97 } // namespace
98
GenerateSchema()99 SchemaProto MonkeySchemaGenerator::GenerateSchema() {
100 SchemaProto schema;
101 for (int i = 0; i < config_->num_types; ++i) {
102 *schema.add_types() = GenerateType();
103 }
104 return schema;
105 }
106
UpdateSchema(const SchemaProto & schema)107 MonkeySchemaGenerator::UpdateSchemaResult MonkeySchemaGenerator::UpdateSchema(
108 const SchemaProto& schema) {
109 UpdateSchemaResult result = {std::move(schema)};
110 SchemaProto& new_schema = result.schema;
111
112 // Delete up to 2 existing types.
113 std::uniform_int_distribution<> num_types_to_delete_dist(0, 2);
114 for (int num_types_to_delete = num_types_to_delete_dist(*random_);
115 num_types_to_delete >= 0; --num_types_to_delete) {
116 if (new_schema.types_size() > 0) {
117 std::uniform_int_distribution<> dist(0, new_schema.types_size() - 1);
118 int index_to_delete = dist(*random_);
119 result.schema_types_deleted.insert(
120 new_schema.types(index_to_delete).schema_type());
121 new_schema.mutable_types()->SwapElements(index_to_delete,
122 new_schema.types_size() - 1);
123 new_schema.mutable_types()->RemoveLast();
124 }
125 }
126
127 // Updating about 1/3 of existing types.
128 for (int i = 0; i < new_schema.types_size(); ++i) {
129 std::uniform_int_distribution<> dist(0, 2);
130 if (dist(*random_) == 0) {
131 UpdateType(*new_schema.mutable_types(i), result);
132 }
133 }
134
135 // Add up to 2 new types.
136 std::uniform_int_distribution<> num_types_to_add_dist(0, 2);
137 for (int num_types_to_add = num_types_to_add_dist(*random_);
138 num_types_to_add >= 0; --num_types_to_add) {
139 *new_schema.add_types() = GenerateType();
140 }
141
142 return result;
143 }
144
GenerateProperty(const SchemaTypeConfigProto & type_config,PropertyConfigProto::Cardinality::Code cardinality,bool indexable)145 PropertyConfigProto MonkeySchemaGenerator::GenerateProperty(
146 const SchemaTypeConfigProto& type_config,
147 PropertyConfigProto::Cardinality::Code cardinality, bool indexable) {
148 PropertyConfigProto prop;
149 prop.set_property_name(
150 "MonkeyTestProp" +
151 std::to_string(num_properties_generated_[type_config.schema_type()]++));
152 // TODO: Perhaps in future iterations we will want to generate more types of
153 // properties.
154 // Currently, we are generating either a string or a vector property.
155 if (GetRandomBoolean(random_)) {
156 prop.set_data_type(PropertyConfigProto::DataType::STRING);
157 SetStringIndexingConfig(random_, prop, indexable);
158 } else {
159 prop.set_data_type(PropertyConfigProto::DataType::VECTOR);
160 SetEmbeddingIndexingConfig(random_, prop, indexable);
161 }
162 prop.set_cardinality(cardinality);
163 return prop;
164 }
165
UpdateProperty(const SchemaTypeConfigProto & type_config,PropertyConfigProto & property,UpdateSchemaResult & result)166 void MonkeySchemaGenerator::UpdateProperty(
167 const SchemaTypeConfigProto& type_config, PropertyConfigProto& property,
168 UpdateSchemaResult& result) {
169 PropertyConfigProto::Cardinality::Code new_cardinality =
170 GetRandomCardinality(random_);
171 if (new_cardinality != property.cardinality()) {
172 // Only do compatible cardinality update for now, otherwise it would be hard
173 // to track which documents will be invalid after updating the schema.
174 //
175 // The following type of updates are not allowed:
176 // - optional -> required
177 // - repeated -> optional
178 // - repeated -> required
179 if (property.cardinality() == PropertyConfigProto::Cardinality::OPTIONAL &&
180 new_cardinality == PropertyConfigProto::Cardinality::REQUIRED) {
181 return;
182 }
183 if (property.cardinality() == PropertyConfigProto::Cardinality::REPEATED &&
184 (new_cardinality == PropertyConfigProto::Cardinality::OPTIONAL ||
185 new_cardinality == PropertyConfigProto::Cardinality::REQUIRED)) {
186 return;
187 }
188 property.set_cardinality(new_cardinality);
189 }
190
191 bool old_indexable = IsIndexableProperty(property);
192 bool new_indexable = GetRandomBoolean(random_);
193 bool index_incompatible = old_indexable != new_indexable;
194 if (property.data_type() == PropertyConfigProto::DataType::STRING) {
195 TermMatchType::Code old_term_match_type =
196 property.string_indexing_config().term_match_type();
197 SetStringIndexingConfig(random_, property, new_indexable);
198 TermMatchType::Code new_term_match_type =
199 property.string_indexing_config().term_match_type();
200 if (old_term_match_type != new_term_match_type) {
201 index_incompatible = true;
202 }
203 } else if (property.data_type() == PropertyConfigProto::DataType::VECTOR) {
204 SetEmbeddingIndexingConfig(random_, property, new_indexable);
205 }
206 if (index_incompatible) {
207 result.schema_types_index_incompatible.insert(type_config.schema_type());
208 }
209 }
210
GenerateType()211 SchemaTypeConfigProto MonkeySchemaGenerator::GenerateType() {
212 SchemaTypeConfigProto type_config;
213 type_config.set_schema_type("MonkeyTestType" +
214 std::to_string(num_types_generated_++));
215 std::uniform_int_distribution<> possible_num_properties_dist(
216 0, config_->possible_num_properties.size() - 1);
217 int total_num_properties =
218 config_->possible_num_properties[possible_num_properties_dist(*random_)];
219
220 int num_indexed_properties = 0;
221 for (int i = 0; i < total_num_properties; ++i) {
222 bool indexable = false;
223 if (num_indexed_properties < kTotalNumSections) {
224 indexable = GetRandomBoolean(random_);
225 }
226 if (indexable) {
227 num_indexed_properties += 1;
228 }
229 (*type_config.add_properties()) =
230 GenerateProperty(type_config, GetRandomCardinality(random_), indexable);
231 }
232 return type_config;
233 }
234
UpdateType(SchemaTypeConfigProto & type_config,UpdateSchemaResult & result)235 void MonkeySchemaGenerator::UpdateType(SchemaTypeConfigProto& type_config,
236 UpdateSchemaResult& result) {
237 // Delete up to 4 existing property.
238 std::uniform_int_distribution<> num_properties_to_delete_dist(0, 4);
239 for (int num_properties_to_delete = num_properties_to_delete_dist(*random_);
240 num_properties_to_delete >= 0; --num_properties_to_delete) {
241 if (type_config.properties_size() > 0) {
242 std::uniform_int_distribution<> dist(0,
243 type_config.properties_size() - 1);
244 int index_to_delete = dist(*random_);
245 // Only delete a required property for now, otherwise it would be hard
246 // to track which documents will be invalid after updating the schema.
247 if (type_config.properties(index_to_delete).cardinality() !=
248 PropertyConfigProto::Cardinality::REQUIRED) {
249 continue;
250 }
251 if (IsIndexableProperty(type_config.properties(index_to_delete))) {
252 result.schema_types_index_incompatible.insert(
253 type_config.schema_type());
254 }
255 // Removing a property will cause the type to be considered as
256 // incompatible.
257 result.schema_types_incompatible.insert(type_config.schema_type());
258
259 type_config.mutable_properties()->SwapElements(
260 index_to_delete, type_config.properties_size() - 1);
261 type_config.mutable_properties()->RemoveLast();
262 }
263 }
264
265 // Updating about 1/3 of existing properties.
266 for (int i = 0; i < type_config.properties_size(); ++i) {
267 std::uniform_int_distribution<> dist(0, 2);
268 if (dist(*random_) == 0) {
269 UpdateProperty(type_config, *type_config.mutable_properties(i), result);
270 }
271 }
272
273 // Add up to 4 new properties.
274 std::uniform_int_distribution<> num_types_to_add_dist(0, 4);
275 for (int num_types_to_add = num_types_to_add_dist(*random_);
276 num_types_to_add >= 0; --num_types_to_add) {
277 PropertyConfigProto::Cardinality::Code new_cardinality =
278 GetRandomCardinality(random_);
279 // Adding a required property will make all document of this type invalid.
280 if (new_cardinality == PropertyConfigProto::Cardinality::REQUIRED) {
281 result.schema_types_incompatible.insert(type_config.schema_type());
282 }
283 bool indexable = GetRandomBoolean(random_);
284 PropertyConfigProto new_property =
285 GenerateProperty(type_config, new_cardinality, indexable);
286 if (indexable) {
287 result.schema_types_index_incompatible.insert(type_config.schema_type());
288 }
289 (*type_config.add_properties()) = std::move(new_property);
290 }
291
292 int num_indexed_properties = 0;
293 for (int i = 0; i < type_config.properties_size(); ++i) {
294 if (IsIndexableProperty(type_config.properties(i))) {
295 ++num_indexed_properties;
296 }
297 }
298
299 if (num_indexed_properties > kTotalNumSections) {
300 result.is_invalid_schema = true;
301 }
302 }
303
GetNamespace() const304 std::string MonkeyDocumentGenerator::GetNamespace() const {
305 uint32_t name_space;
306 // When num_namespaces is 0, all documents generated get different namespaces.
307 // Otherwise, namespaces will be randomly picked from a set with
308 // num_namespaces elements.
309 if (config_->num_namespaces == 0) {
310 name_space = num_docs_generated_;
311 } else {
312 std::uniform_int_distribution<> dist(0, config_->num_namespaces - 1);
313 name_space = dist(*random_);
314 }
315 return absl_ports::StrCat("namespace", std::to_string(name_space));
316 }
317
GetUri() const318 std::string MonkeyDocumentGenerator::GetUri() const {
319 uint32_t uri;
320 // When num_uris is 0, all documents generated get different URIs. Otherwise,
321 // URIs will be randomly picked from a set with num_uris elements.
322 if (config_->num_uris == 0) {
323 uri = num_docs_generated_;
324 } else {
325 std::uniform_int_distribution<> dist(0, config_->num_uris - 1);
326 uri = dist(*random_);
327 }
328 return absl_ports::StrCat("uri", std::to_string(uri));
329 }
330
GetNumTokens() const331 int MonkeyDocumentGenerator::GetNumTokens() const {
332 std::uniform_int_distribution<> dist(0,
333 config_->possible_num_tokens.size() - 1);
334 int n = config_->possible_num_tokens[dist(*random_)];
335 // Add some noise
336 std::uniform_real_distribution<> real_dist(0.5, 1);
337 float p = real_dist(*random_);
338 return n * p;
339 }
340
GetStringPropertyContent() const341 std::vector<std::string> MonkeyDocumentGenerator::GetStringPropertyContent()
342 const {
343 int num_tokens = GetNumTokens();
344 std::vector<std::string> content;
345 content.reserve(num_tokens);
346 while (num_tokens) {
347 content.push_back(std::string(GetToken()));
348 --num_tokens;
349 }
350 return content;
351 }
352
GetNumVectors(PropertyConfigProto::Cardinality::Code cardinality) const353 int MonkeyDocumentGenerator::GetNumVectors(
354 PropertyConfigProto::Cardinality::Code cardinality) const {
355 if (cardinality == PropertyConfigProto::Cardinality::REQUIRED) {
356 return 1;
357 } else if (cardinality == PropertyConfigProto::Cardinality::OPTIONAL) {
358 std::uniform_int_distribution<> dist(0, 1);
359 return dist(*random_);
360 }
361
362 // For repeated properties:
363 std::uniform_int_distribution<> dist(
364 0, config_->possible_num_vectors.size() - 1);
365 int n = config_->possible_num_vectors[dist(*random_)];
366 // Add some noise
367 std::uniform_real_distribution<> real_dist(0.5, 1);
368 float p = real_dist(*random_);
369 return n * p;
370 }
371
GetRandomVector() const372 PropertyProto::VectorProto MonkeyDocumentGenerator::GetRandomVector() const {
373 std::uniform_int_distribution<> dimension_dist(
374 0, config_->possible_vector_dimensions.size() - 1);
375 std::uniform_real_distribution<float> value_dist(-1.0, 1.0);
376
377 PropertyProto::VectorProto vector;
378 vector.set_model_signature("model");
379 int dimension = config_->possible_vector_dimensions[dimension_dist(*random_)];
380 for (int i = 0; i < dimension; ++i) {
381 vector.add_values(value_dist(*random_));
382 }
383 return vector;
384 }
385
386 std::vector<PropertyProto::VectorProto>
GetVectorPropertyContent(PropertyConfigProto::Cardinality::Code cardinality) const387 MonkeyDocumentGenerator::GetVectorPropertyContent(
388 PropertyConfigProto::Cardinality::Code cardinality) const {
389 int num_vectors = GetNumVectors(cardinality);
390 std::vector<PropertyProto::VectorProto> content;
391 content.reserve(num_vectors);
392 while (num_vectors) {
393 content.push_back(GetRandomVector());
394 --num_vectors;
395 }
396 return content;
397 }
398
GenerateDocument()399 MonkeyTokenizedDocument MonkeyDocumentGenerator::GenerateDocument() {
400 MonkeyTokenizedDocument document;
401 const SchemaTypeConfigProto& type_config = GetType();
402 const std::string& name_space = GetNamespace();
403 DocumentBuilder doc_builder =
404 DocumentBuilder()
405 .SetNamespace(name_space)
406 .SetSchema(type_config.schema_type())
407 .SetUri(GetUri())
408 .SetCreationTimestampMs(clock_.GetSystemTimeMilliseconds());
409 for (const PropertyConfigProto& prop : type_config.properties()) {
410 if (prop.data_type() == PropertyConfigProto::DataType::STRING) {
411 std::vector<std::string> prop_content = GetStringPropertyContent();
412 doc_builder.AddStringProperty(prop.property_name(),
413 absl_ports::StrJoin(prop_content, " "));
414 // No matter whether the property is indexable currently, we have to
415 // create a section for it since a non-indexable property can become
416 // indexable after a schema type change. The in-memory icing will
417 // automatically skip sections that are non-indexable at the time of
418 // search requests.
419 MonkeyTokenizedSection section = {prop.property_name(),
420 std::move(prop_content)};
421 document.tokenized_sections.push_back(std::move(section));
422 } else {
423 std::vector<PropertyProto::VectorProto> prop_content =
424 GetVectorPropertyContent(prop.cardinality());
425 doc_builder.AddVectorProperty(prop.property_name(), prop_content);
426
427 // Similar to the string property, no matter whether the property is
428 // indexable currently, we have to create a section for it.
429 MonkeyTokenizedSection section = {
430 prop.property_name(), /*token_sequence=*/{}, std::move(prop_content)};
431 document.tokenized_sections.push_back(std::move(section));
432 }
433 }
434 document.document = doc_builder.Build();
435 ++num_docs_generated_;
436 return document;
437 }
438
439 } // namespace lib
440 } // namespace icing
441