1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/monkey_test/monkey-test-generators.h"
16
17 #include <array>
18 #include <cstdint>
19 #include <random>
20 #include <string>
21 #include <string_view>
22 #include <unordered_set>
23 #include <utility>
24 #include <vector>
25
26 #include "icing/absl_ports/str_cat.h"
27 #include "icing/absl_ports/str_join.h"
28 #include "icing/document-builder.h"
29 #include "icing/monkey_test/monkey-test-util.h"
30 #include "icing/monkey_test/monkey-tokenized-document.h"
31 #include "icing/proto/schema.pb.h"
32 #include "icing/proto/term.pb.h"
33 #include "icing/schema/section.h"
34
35 namespace icing {
36 namespace lib {
37
38 namespace {
39
40 constexpr std::array<PropertyConfigProto::Cardinality::Code, 3> kCardinalities =
41 {PropertyConfigProto::Cardinality::REPEATED,
42 PropertyConfigProto::Cardinality::OPTIONAL,
43 PropertyConfigProto::Cardinality::REQUIRED};
44
45 constexpr std::array<TermMatchType::Code, 3> kTermMatchTypes = {
46 TermMatchType::UNKNOWN, TermMatchType::EXACT_ONLY, TermMatchType::PREFIX};
47
GetRandomCardinality(MonkeyTestRandomEngine * random)48 PropertyConfigProto::Cardinality::Code GetRandomCardinality(
49 MonkeyTestRandomEngine* random) {
50 std::uniform_int_distribution<> dist(0, kCardinalities.size() - 1);
51 return kCardinalities[dist(*random)];
52 }
53
GetRandomTermMatchType(MonkeyTestRandomEngine * random)54 TermMatchType::Code GetRandomTermMatchType(MonkeyTestRandomEngine* random) {
55 std::uniform_int_distribution<> dist(0, kTermMatchTypes.size() - 1);
56 return kTermMatchTypes[dist(*random)];
57 }
58
59 // TODO: Update this function when supporting document_indexing_config.
IsIndexableProperty(const PropertyConfigProto & property)60 bool IsIndexableProperty(const PropertyConfigProto& property) {
61 return property.string_indexing_config().term_match_type() !=
62 TermMatchType::UNKNOWN;
63 }
64
SetStringIndexingConfig(PropertyConfigProto & property,TermMatchType::Code term_match_type)65 void SetStringIndexingConfig(PropertyConfigProto& property,
66 TermMatchType::Code term_match_type) {
67 if (term_match_type != TermMatchType::UNKNOWN) {
68 StringIndexingConfig* string_indexing_config =
69 property.mutable_string_indexing_config();
70 string_indexing_config->set_term_match_type(term_match_type);
71 // TODO: Try to add different TokenizerTypes. VERBATIM, RFC822, and URL are
72 // the remaining candidates to consider.
73 string_indexing_config->set_tokenizer_type(
74 StringIndexingConfig::TokenizerType::PLAIN);
75 } else {
76 property.clear_string_indexing_config();
77 }
78 }
79
80 } // namespace
81
GenerateSchema()82 SchemaProto MonkeySchemaGenerator::GenerateSchema() {
83 SchemaProto schema;
84 for (int i = 0; i < config_->num_types; ++i) {
85 *schema.add_types() = GenerateType();
86 }
87 return schema;
88 }
89
UpdateSchema(const SchemaProto & schema)90 MonkeySchemaGenerator::UpdateSchemaResult MonkeySchemaGenerator::UpdateSchema(
91 const SchemaProto& schema) {
92 UpdateSchemaResult result = {std::move(schema)};
93 SchemaProto& new_schema = result.schema;
94
95 // Delete up to 2 existing types.
96 std::uniform_int_distribution<> num_types_to_delete_dist(0, 2);
97 for (int num_types_to_delete = num_types_to_delete_dist(*random_);
98 num_types_to_delete >= 0; --num_types_to_delete) {
99 if (new_schema.types_size() > 0) {
100 std::uniform_int_distribution<> dist(0, new_schema.types_size() - 1);
101 int index_to_delete = dist(*random_);
102 result.schema_types_deleted.insert(
103 new_schema.types(index_to_delete).schema_type());
104 new_schema.mutable_types()->SwapElements(index_to_delete,
105 new_schema.types_size() - 1);
106 new_schema.mutable_types()->RemoveLast();
107 }
108 }
109
110 // Updating about 1/3 of existing types.
111 for (int i = 0; i < new_schema.types_size(); ++i) {
112 std::uniform_int_distribution<> dist(0, 2);
113 if (dist(*random_) == 0) {
114 UpdateType(*new_schema.mutable_types(i), result);
115 }
116 }
117
118 // Add up to 2 new types.
119 std::uniform_int_distribution<> num_types_to_add_dist(0, 2);
120 for (int num_types_to_add = num_types_to_add_dist(*random_);
121 num_types_to_add >= 0; --num_types_to_add) {
122 *new_schema.add_types() = GenerateType();
123 }
124
125 return result;
126 }
127
GenerateProperty(const SchemaTypeConfigProto & type_config,PropertyConfigProto::Cardinality::Code cardinality,TermMatchType::Code term_match_type)128 PropertyConfigProto MonkeySchemaGenerator::GenerateProperty(
129 const SchemaTypeConfigProto& type_config,
130 PropertyConfigProto::Cardinality::Code cardinality,
131 TermMatchType::Code term_match_type) {
132 PropertyConfigProto prop;
133 prop.set_property_name(
134 "MonkeyTestProp" +
135 std::to_string(num_properties_generated_[type_config.schema_type()]++));
136 // TODO: Perhaps in future iterations we will want to generate more than just
137 // string properties.
138 prop.set_data_type(PropertyConfigProto::DataType::STRING);
139 prop.set_cardinality(cardinality);
140 SetStringIndexingConfig(prop, term_match_type);
141 return prop;
142 }
143
UpdateProperty(const SchemaTypeConfigProto & type_config,PropertyConfigProto & property,UpdateSchemaResult & result)144 void MonkeySchemaGenerator::UpdateProperty(
145 const SchemaTypeConfigProto& type_config, PropertyConfigProto& property,
146 UpdateSchemaResult& result) {
147 PropertyConfigProto::Cardinality::Code new_cardinality =
148 GetRandomCardinality(random_);
149 if (new_cardinality != property.cardinality()) {
150 // Only do compatible cardinality update for now, otherwise it would be hard
151 // to track which documents will be invalid after updating the schema.
152 //
153 // The following type of updates are not allowed:
154 // - optional -> required
155 // - repeated -> optional
156 // - repeated -> required
157 if (property.cardinality() == PropertyConfigProto::Cardinality::OPTIONAL &&
158 new_cardinality == PropertyConfigProto::Cardinality::REQUIRED) {
159 return;
160 }
161 if (property.cardinality() == PropertyConfigProto::Cardinality::REPEATED &&
162 (new_cardinality == PropertyConfigProto::Cardinality::OPTIONAL ||
163 new_cardinality == PropertyConfigProto::Cardinality::REQUIRED)) {
164 return;
165 }
166 property.set_cardinality(new_cardinality);
167 }
168
169 if (property.data_type() == PropertyConfigProto::DataType::STRING) {
170 TermMatchType::Code new_term_match_type = GetRandomTermMatchType(random_);
171 if (new_term_match_type !=
172 property.string_indexing_config().term_match_type()) {
173 SetStringIndexingConfig(property, new_term_match_type);
174 result.schema_types_index_incompatible.insert(type_config.schema_type());
175 }
176 }
177 }
178
GenerateType()179 SchemaTypeConfigProto MonkeySchemaGenerator::GenerateType() {
180 SchemaTypeConfigProto type_config;
181 type_config.set_schema_type("MonkeyTestType" +
182 std::to_string(num_types_generated_++));
183 std::uniform_int_distribution<> possible_num_properties_dist(
184 0, config_->possible_num_properties.size() - 1);
185 int total_num_properties =
186 config_->possible_num_properties[possible_num_properties_dist(*random_)];
187
188 int num_indexed_properties = 0;
189 for (int i = 0; i < total_num_properties; ++i) {
190 TermMatchType::Code term_match_type = TermMatchType::UNKNOWN;
191 if (num_indexed_properties < kTotalNumSections) {
192 term_match_type = GetRandomTermMatchType(random_);
193 }
194 if (term_match_type != TermMatchType::UNKNOWN) {
195 num_indexed_properties += 1;
196 }
197 (*type_config.add_properties()) = GenerateProperty(
198 type_config, GetRandomCardinality(random_), term_match_type);
199 }
200 return type_config;
201 }
202
UpdateType(SchemaTypeConfigProto & type_config,UpdateSchemaResult & result)203 void MonkeySchemaGenerator::UpdateType(SchemaTypeConfigProto& type_config,
204 UpdateSchemaResult& result) {
205 // Delete up to 4 existing property.
206 std::uniform_int_distribution<> num_properties_to_delete_dist(0, 4);
207 for (int num_properties_to_delete = num_properties_to_delete_dist(*random_);
208 num_properties_to_delete >= 0; --num_properties_to_delete) {
209 if (type_config.properties_size() > 0) {
210 std::uniform_int_distribution<> dist(0,
211 type_config.properties_size() - 1);
212 int index_to_delete = dist(*random_);
213 // Only delete a required property for now, otherwise it would be hard
214 // to track which documents will be invalid after updating the schema.
215 if (type_config.properties(index_to_delete).cardinality() !=
216 PropertyConfigProto::Cardinality::REQUIRED) {
217 continue;
218 }
219 if (IsIndexableProperty(type_config.properties(index_to_delete))) {
220 result.schema_types_index_incompatible.insert(
221 type_config.schema_type());
222 }
223 // Removing a property will cause the type to be considered as
224 // incompatible.
225 result.schema_types_incompatible.insert(type_config.schema_type());
226
227 type_config.mutable_properties()->SwapElements(
228 index_to_delete, type_config.properties_size() - 1);
229 type_config.mutable_properties()->RemoveLast();
230 }
231 }
232
233 // Updating about 1/3 of existing properties.
234 for (int i = 0; i < type_config.properties_size(); ++i) {
235 std::uniform_int_distribution<> dist(0, 2);
236 if (dist(*random_) == 0) {
237 UpdateProperty(type_config, *type_config.mutable_properties(i), result);
238 }
239 }
240
241 // Add up to 4 new properties.
242 std::uniform_int_distribution<> num_types_to_add_dist(0, 4);
243 for (int num_types_to_add = num_types_to_add_dist(*random_);
244 num_types_to_add >= 0; --num_types_to_add) {
245 PropertyConfigProto::Cardinality::Code new_cardinality =
246 GetRandomCardinality(random_);
247 // Adding a required property will make all document of this type invalid.
248 if (new_cardinality == PropertyConfigProto::Cardinality::REQUIRED) {
249 result.schema_types_incompatible.insert(type_config.schema_type());
250 }
251 PropertyConfigProto new_property = GenerateProperty(
252 type_config, new_cardinality, GetRandomTermMatchType(random_));
253 if (IsIndexableProperty(new_property)) {
254 result.schema_types_index_incompatible.insert(type_config.schema_type());
255 }
256 (*type_config.add_properties()) = std::move(new_property);
257 }
258
259 int num_indexed_properties = 0;
260 for (int i = 0; i < type_config.properties_size(); ++i) {
261 if (IsIndexableProperty(type_config.properties(i))) {
262 ++num_indexed_properties;
263 }
264 }
265
266 if (num_indexed_properties > kTotalNumSections) {
267 result.is_invalid_schema = true;
268 }
269 }
270
GetNamespace() const271 std::string MonkeyDocumentGenerator::GetNamespace() const {
272 uint32_t name_space;
273 // When num_namespaces is 0, all documents generated get different namespaces.
274 // Otherwise, namespaces will be randomly picked from a set with
275 // num_namespaces elements.
276 if (config_->num_namespaces == 0) {
277 name_space = num_docs_generated_;
278 } else {
279 std::uniform_int_distribution<> dist(0, config_->num_namespaces - 1);
280 name_space = dist(*random_);
281 }
282 return absl_ports::StrCat("namespace", std::to_string(name_space));
283 }
284
GetUri() const285 std::string MonkeyDocumentGenerator::GetUri() const {
286 uint32_t uri;
287 // When num_uris is 0, all documents generated get different URIs. Otherwise,
288 // URIs will be randomly picked from a set with num_uris elements.
289 if (config_->num_uris == 0) {
290 uri = num_docs_generated_;
291 } else {
292 std::uniform_int_distribution<> dist(0, config_->num_uris - 1);
293 uri = dist(*random_);
294 }
295 return absl_ports::StrCat("uri", std::to_string(uri));
296 }
297
GetNumTokens() const298 int MonkeyDocumentGenerator::GetNumTokens() const {
299 std::uniform_int_distribution<> dist(
300 0, config_->possible_num_tokens_.size() - 1);
301 int n = config_->possible_num_tokens_[dist(*random_)];
302 // Add some noise
303 std::uniform_real_distribution<> real_dist(0.5, 1);
304 float p = real_dist(*random_);
305 return n * p;
306 }
307
GetPropertyContent() const308 std::vector<std::string> MonkeyDocumentGenerator::GetPropertyContent() const {
309 std::vector<std::string> content;
310 int num_tokens = GetNumTokens();
311 while (num_tokens) {
312 content.push_back(std::string(GetToken()));
313 --num_tokens;
314 }
315 return content;
316 }
317
GenerateDocument()318 MonkeyTokenizedDocument MonkeyDocumentGenerator::GenerateDocument() {
319 MonkeyTokenizedDocument document;
320 const SchemaTypeConfigProto& type_config = GetType();
321 const std::string& name_space = GetNamespace();
322 DocumentBuilder doc_builder =
323 DocumentBuilder()
324 .SetNamespace(name_space)
325 .SetSchema(type_config.schema_type())
326 .SetUri(GetUri())
327 .SetCreationTimestampMs(clock_.GetSystemTimeMilliseconds());
328 for (const PropertyConfigProto& prop : type_config.properties()) {
329 std::vector<std::string> prop_content = GetPropertyContent();
330 doc_builder.AddStringProperty(prop.property_name(),
331 absl_ports::StrJoin(prop_content, " "));
332 // No matter whether the property is indexable currently, we have to create
333 // a section for it since a non-indexable property can become indexable
334 // after a schema type change. The in-memory icing will automatically skip
335 // sections that are non-indexable at the time of search requests.
336 MonkeyTokenizedSection section = {prop.property_name(),
337 std::move(prop_content)};
338 document.tokenized_sections.push_back(std::move(section));
339 }
340 document.document = doc_builder.Build();
341 ++num_docs_generated_;
342 return document;
343 }
344
345 } // namespace lib
346 } // namespace icing
347