• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2022 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/monkey_test/monkey-test-generators.h"
16 
17 #include <array>
18 #include <cstdint>
19 #include <random>
20 #include <string>
21 #include <string_view>
22 #include <unordered_set>
23 #include <utility>
24 #include <vector>
25 
26 #include "icing/absl_ports/str_cat.h"
27 #include "icing/absl_ports/str_join.h"
28 #include "icing/document-builder.h"
29 #include "icing/monkey_test/monkey-test-util.h"
30 #include "icing/monkey_test/monkey-tokenized-document.h"
31 #include "icing/proto/schema.pb.h"
32 #include "icing/proto/term.pb.h"
33 #include "icing/schema/section.h"
34 
35 namespace icing {
36 namespace lib {
37 
38 namespace {
39 
40 constexpr std::array<PropertyConfigProto::Cardinality::Code, 3> kCardinalities =
41     {PropertyConfigProto::Cardinality::REPEATED,
42      PropertyConfigProto::Cardinality::OPTIONAL,
43      PropertyConfigProto::Cardinality::REQUIRED};
44 
45 constexpr std::array<TermMatchType::Code, 3> kTermMatchTypes = {
46     TermMatchType::UNKNOWN, TermMatchType::EXACT_ONLY, TermMatchType::PREFIX};
47 
GetRandomCardinality(MonkeyTestRandomEngine * random)48 PropertyConfigProto::Cardinality::Code GetRandomCardinality(
49     MonkeyTestRandomEngine* random) {
50   std::uniform_int_distribution<> dist(0, kCardinalities.size() - 1);
51   return kCardinalities[dist(*random)];
52 }
53 
GetRandomTermMatchType(MonkeyTestRandomEngine * random)54 TermMatchType::Code GetRandomTermMatchType(MonkeyTestRandomEngine* random) {
55   std::uniform_int_distribution<> dist(0, kTermMatchTypes.size() - 1);
56   return kTermMatchTypes[dist(*random)];
57 }
58 
59 // TODO: Update this function when supporting document_indexing_config.
IsIndexableProperty(const PropertyConfigProto & property)60 bool IsIndexableProperty(const PropertyConfigProto& property) {
61   return property.string_indexing_config().term_match_type() !=
62          TermMatchType::UNKNOWN;
63 }
64 
SetStringIndexingConfig(PropertyConfigProto & property,TermMatchType::Code term_match_type)65 void SetStringIndexingConfig(PropertyConfigProto& property,
66                              TermMatchType::Code term_match_type) {
67   if (term_match_type != TermMatchType::UNKNOWN) {
68     StringIndexingConfig* string_indexing_config =
69         property.mutable_string_indexing_config();
70     string_indexing_config->set_term_match_type(term_match_type);
71     // TODO: Try to add different TokenizerTypes. VERBATIM, RFC822, and URL are
72     // the remaining candidates to consider.
73     string_indexing_config->set_tokenizer_type(
74         StringIndexingConfig::TokenizerType::PLAIN);
75   } else {
76     property.clear_string_indexing_config();
77   }
78 }
79 
80 }  // namespace
81 
GenerateSchema()82 SchemaProto MonkeySchemaGenerator::GenerateSchema() {
83   SchemaProto schema;
84   for (int i = 0; i < config_->num_types; ++i) {
85     *schema.add_types() = GenerateType();
86   }
87   return schema;
88 }
89 
UpdateSchema(const SchemaProto & schema)90 MonkeySchemaGenerator::UpdateSchemaResult MonkeySchemaGenerator::UpdateSchema(
91     const SchemaProto& schema) {
92   UpdateSchemaResult result = {std::move(schema)};
93   SchemaProto& new_schema = result.schema;
94 
95   // Delete up to 2 existing types.
96   std::uniform_int_distribution<> num_types_to_delete_dist(0, 2);
97   for (int num_types_to_delete = num_types_to_delete_dist(*random_);
98        num_types_to_delete >= 0; --num_types_to_delete) {
99     if (new_schema.types_size() > 0) {
100       std::uniform_int_distribution<> dist(0, new_schema.types_size() - 1);
101       int index_to_delete = dist(*random_);
102       result.schema_types_deleted.insert(
103           new_schema.types(index_to_delete).schema_type());
104       new_schema.mutable_types()->SwapElements(index_to_delete,
105                                                new_schema.types_size() - 1);
106       new_schema.mutable_types()->RemoveLast();
107     }
108   }
109 
110   // Updating about 1/3 of existing types.
111   for (int i = 0; i < new_schema.types_size(); ++i) {
112     std::uniform_int_distribution<> dist(0, 2);
113     if (dist(*random_) == 0) {
114       UpdateType(*new_schema.mutable_types(i), result);
115     }
116   }
117 
118   // Add up to 2 new types.
119   std::uniform_int_distribution<> num_types_to_add_dist(0, 2);
120   for (int num_types_to_add = num_types_to_add_dist(*random_);
121        num_types_to_add >= 0; --num_types_to_add) {
122     *new_schema.add_types() = GenerateType();
123   }
124 
125   return result;
126 }
127 
GenerateProperty(const SchemaTypeConfigProto & type_config,PropertyConfigProto::Cardinality::Code cardinality,TermMatchType::Code term_match_type)128 PropertyConfigProto MonkeySchemaGenerator::GenerateProperty(
129     const SchemaTypeConfigProto& type_config,
130     PropertyConfigProto::Cardinality::Code cardinality,
131     TermMatchType::Code term_match_type) {
132   PropertyConfigProto prop;
133   prop.set_property_name(
134       "MonkeyTestProp" +
135       std::to_string(num_properties_generated_[type_config.schema_type()]++));
136   // TODO: Perhaps in future iterations we will want to generate more than just
137   // string properties.
138   prop.set_data_type(PropertyConfigProto::DataType::STRING);
139   prop.set_cardinality(cardinality);
140   SetStringIndexingConfig(prop, term_match_type);
141   return prop;
142 }
143 
UpdateProperty(const SchemaTypeConfigProto & type_config,PropertyConfigProto & property,UpdateSchemaResult & result)144 void MonkeySchemaGenerator::UpdateProperty(
145     const SchemaTypeConfigProto& type_config, PropertyConfigProto& property,
146     UpdateSchemaResult& result) {
147   PropertyConfigProto::Cardinality::Code new_cardinality =
148       GetRandomCardinality(random_);
149   if (new_cardinality != property.cardinality()) {
150     // Only do compatible cardinality update for now, otherwise it would be hard
151     // to track which documents will be invalid after updating the schema.
152     //
153     // The following type of updates are not allowed:
154     // - optional -> required
155     // - repeated -> optional
156     // - repeated -> required
157     if (property.cardinality() == PropertyConfigProto::Cardinality::OPTIONAL &&
158         new_cardinality == PropertyConfigProto::Cardinality::REQUIRED) {
159       return;
160     }
161     if (property.cardinality() == PropertyConfigProto::Cardinality::REPEATED &&
162         (new_cardinality == PropertyConfigProto::Cardinality::OPTIONAL ||
163          new_cardinality == PropertyConfigProto::Cardinality::REQUIRED)) {
164       return;
165     }
166     property.set_cardinality(new_cardinality);
167   }
168 
169   if (property.data_type() == PropertyConfigProto::DataType::STRING) {
170     TermMatchType::Code new_term_match_type = GetRandomTermMatchType(random_);
171     if (new_term_match_type !=
172         property.string_indexing_config().term_match_type()) {
173       SetStringIndexingConfig(property, new_term_match_type);
174       result.schema_types_index_incompatible.insert(type_config.schema_type());
175     }
176   }
177 }
178 
GenerateType()179 SchemaTypeConfigProto MonkeySchemaGenerator::GenerateType() {
180   SchemaTypeConfigProto type_config;
181   type_config.set_schema_type("MonkeyTestType" +
182                               std::to_string(num_types_generated_++));
183   std::uniform_int_distribution<> possible_num_properties_dist(
184       0, config_->possible_num_properties.size() - 1);
185   int total_num_properties =
186       config_->possible_num_properties[possible_num_properties_dist(*random_)];
187 
188   int num_indexed_properties = 0;
189   for (int i = 0; i < total_num_properties; ++i) {
190     TermMatchType::Code term_match_type = TermMatchType::UNKNOWN;
191     if (num_indexed_properties < kTotalNumSections) {
192       term_match_type = GetRandomTermMatchType(random_);
193     }
194     if (term_match_type != TermMatchType::UNKNOWN) {
195       num_indexed_properties += 1;
196     }
197     (*type_config.add_properties()) = GenerateProperty(
198         type_config, GetRandomCardinality(random_), term_match_type);
199   }
200   return type_config;
201 }
202 
UpdateType(SchemaTypeConfigProto & type_config,UpdateSchemaResult & result)203 void MonkeySchemaGenerator::UpdateType(SchemaTypeConfigProto& type_config,
204                                        UpdateSchemaResult& result) {
205   // Delete up to 4 existing property.
206   std::uniform_int_distribution<> num_properties_to_delete_dist(0, 4);
207   for (int num_properties_to_delete = num_properties_to_delete_dist(*random_);
208        num_properties_to_delete >= 0; --num_properties_to_delete) {
209     if (type_config.properties_size() > 0) {
210       std::uniform_int_distribution<> dist(0,
211                                            type_config.properties_size() - 1);
212       int index_to_delete = dist(*random_);
213       // Only delete a required property for now, otherwise it would be hard
214       // to track which documents will be invalid after updating the schema.
215       if (type_config.properties(index_to_delete).cardinality() !=
216           PropertyConfigProto::Cardinality::REQUIRED) {
217         continue;
218       }
219       if (IsIndexableProperty(type_config.properties(index_to_delete))) {
220         result.schema_types_index_incompatible.insert(
221             type_config.schema_type());
222       }
223       // Removing a property will cause the type to be considered as
224       // incompatible.
225       result.schema_types_incompatible.insert(type_config.schema_type());
226 
227       type_config.mutable_properties()->SwapElements(
228           index_to_delete, type_config.properties_size() - 1);
229       type_config.mutable_properties()->RemoveLast();
230     }
231   }
232 
233   // Updating about 1/3 of existing properties.
234   for (int i = 0; i < type_config.properties_size(); ++i) {
235     std::uniform_int_distribution<> dist(0, 2);
236     if (dist(*random_) == 0) {
237       UpdateProperty(type_config, *type_config.mutable_properties(i), result);
238     }
239   }
240 
241   // Add up to 4 new properties.
242   std::uniform_int_distribution<> num_types_to_add_dist(0, 4);
243   for (int num_types_to_add = num_types_to_add_dist(*random_);
244        num_types_to_add >= 0; --num_types_to_add) {
245     PropertyConfigProto::Cardinality::Code new_cardinality =
246         GetRandomCardinality(random_);
247     // Adding a required property will make all document of this type invalid.
248     if (new_cardinality == PropertyConfigProto::Cardinality::REQUIRED) {
249       result.schema_types_incompatible.insert(type_config.schema_type());
250     }
251     PropertyConfigProto new_property = GenerateProperty(
252         type_config, new_cardinality, GetRandomTermMatchType(random_));
253     if (IsIndexableProperty(new_property)) {
254       result.schema_types_index_incompatible.insert(type_config.schema_type());
255     }
256     (*type_config.add_properties()) = std::move(new_property);
257   }
258 
259   int num_indexed_properties = 0;
260   for (int i = 0; i < type_config.properties_size(); ++i) {
261     if (IsIndexableProperty(type_config.properties(i))) {
262       ++num_indexed_properties;
263     }
264   }
265 
266   if (num_indexed_properties > kTotalNumSections) {
267     result.is_invalid_schema = true;
268   }
269 }
270 
GetNamespace() const271 std::string MonkeyDocumentGenerator::GetNamespace() const {
272   uint32_t name_space;
273   // When num_namespaces is 0, all documents generated get different namespaces.
274   // Otherwise, namespaces will be randomly picked from a set with
275   // num_namespaces elements.
276   if (config_->num_namespaces == 0) {
277     name_space = num_docs_generated_;
278   } else {
279     std::uniform_int_distribution<> dist(0, config_->num_namespaces - 1);
280     name_space = dist(*random_);
281   }
282   return absl_ports::StrCat("namespace", std::to_string(name_space));
283 }
284 
GetUri() const285 std::string MonkeyDocumentGenerator::GetUri() const {
286   uint32_t uri;
287   // When num_uris is 0, all documents generated get different URIs. Otherwise,
288   // URIs will be randomly picked from a set with num_uris elements.
289   if (config_->num_uris == 0) {
290     uri = num_docs_generated_;
291   } else {
292     std::uniform_int_distribution<> dist(0, config_->num_uris - 1);
293     uri = dist(*random_);
294   }
295   return absl_ports::StrCat("uri", std::to_string(uri));
296 }
297 
GetNumTokens() const298 int MonkeyDocumentGenerator::GetNumTokens() const {
299   std::uniform_int_distribution<> dist(
300       0, config_->possible_num_tokens_.size() - 1);
301   int n = config_->possible_num_tokens_[dist(*random_)];
302   // Add some noise
303   std::uniform_real_distribution<> real_dist(0.5, 1);
304   float p = real_dist(*random_);
305   return n * p;
306 }
307 
GetPropertyContent() const308 std::vector<std::string> MonkeyDocumentGenerator::GetPropertyContent() const {
309   std::vector<std::string> content;
310   int num_tokens = GetNumTokens();
311   while (num_tokens) {
312     content.push_back(std::string(GetToken()));
313     --num_tokens;
314   }
315   return content;
316 }
317 
GenerateDocument()318 MonkeyTokenizedDocument MonkeyDocumentGenerator::GenerateDocument() {
319   MonkeyTokenizedDocument document;
320   const SchemaTypeConfigProto& type_config = GetType();
321   const std::string& name_space = GetNamespace();
322   DocumentBuilder doc_builder =
323       DocumentBuilder()
324           .SetNamespace(name_space)
325           .SetSchema(type_config.schema_type())
326           .SetUri(GetUri())
327           .SetCreationTimestampMs(clock_.GetSystemTimeMilliseconds());
328   for (const PropertyConfigProto& prop : type_config.properties()) {
329     std::vector<std::string> prop_content = GetPropertyContent();
330     doc_builder.AddStringProperty(prop.property_name(),
331                                   absl_ports::StrJoin(prop_content, " "));
332     // No matter whether the property is indexable currently, we have to create
333     // a section for it since a non-indexable property can become indexable
334     // after a schema type change. The in-memory icing will automatically skip
335     // sections that are non-indexable at the time of search requests.
336     MonkeyTokenizedSection section = {prop.property_name(),
337                                       std::move(prop_content)};
338     document.tokenized_sections.push_back(std::move(section));
339   }
340   document.document = doc_builder.Build();
341   ++num_docs_generated_;
342   return document;
343 }
344 
345 }  // namespace lib
346 }  // namespace icing
347