1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/util/document-validator.h"
16
17 #include <cstdint>
18 #include <unordered_set>
19
20 #include "icing/text_classifier/lib3/utils/base/status.h"
21 #include "icing/absl_ports/canonical_errors.h"
22 #include "icing/proto/document.pb.h"
23 #include "icing/proto/schema.pb.h"
24 #include "icing/schema/schema-util.h"
25 #include "icing/util/status-macros.h"
26
27 namespace icing {
28 namespace lib {
29
// Map type from a string_view key to a pointer to a PropertyConfigProto;
// presumably keyed by property name (the name suggests so, but nothing in this
// file populates it).
// NOTE(review): this alias is not referenced anywhere else in the visible
// file - confirm whether it is a leftover before removing it.
using PropertyConfigMap =
    std::unordered_map<std::string_view, const PropertyConfigProto*>;
32
DocumentValidator(const SchemaStore * schema_store)33 DocumentValidator::DocumentValidator(const SchemaStore* schema_store)
34 : schema_store_(schema_store) {}
35
Validate(const DocumentProto & document,int depth)36 libtextclassifier3::Status DocumentValidator::Validate(
37 const DocumentProto& document, int depth) {
38 if (document.namespace_().empty()) {
39 return absl_ports::InvalidArgumentError("Field 'namespace' is empty.");
40 }
41
42 // Only require a non-empty uri on top-level documents.
43 if (depth == 0 && document.uri().empty()) {
44 return absl_ports::InvalidArgumentError("Field 'uri' is empty.");
45 }
46
47 if (document.schema().empty()) {
48 return absl_ports::InvalidArgumentError(
49 absl_ports::StrCat("Field 'schema' is empty for key: (",
50 document.namespace_(), ", ", document.uri(), ")."));
51 }
52
53 if (document.score() < 0) {
54 return absl_ports::InvalidArgumentError("Field 'score' is negative.");
55 }
56
57 if (document.creation_timestamp_ms() < 0) {
58 return absl_ports::InvalidArgumentError(
59 "Field 'creation_timestamp_ms' is negative.");
60 }
61
62 if (document.ttl_ms() < 0) {
63 return absl_ports::InvalidArgumentError("Field 'ttl_ms' is negative.");
64 }
65
66 // TODO(b/144458732): Implement a more robust version of
67 // ICING_ASSIGN_OR_RETURN that can support error logging.
68 auto type_config_or = schema_store_->GetSchemaTypeConfig(document.schema());
69 if (!type_config_or.ok()) {
70 ICING_LOG(ERROR) << type_config_or.status().error_message()
71 << "Error while validating document ("
72 << document.namespace_() << ", " << document.uri() << ")";
73 return type_config_or.status();
74 }
75 const SchemaTypeConfigProto* type_config =
76 std::move(type_config_or).ValueOrDie();
77
78 int32_t num_required_properties_actual = 0;
79 SchemaUtil::ParsedPropertyConfigs parsed_property_configs =
80 SchemaUtil::ParsePropertyConfigs(*type_config);
81 std::unordered_set<std::string_view> unique_properties;
82
83 for (const PropertyProto& property : document.properties()) {
84 if (property.name().empty()) {
85 return absl_ports::InvalidArgumentError(absl_ports::StrCat(
86 "Field 'name' is empty in PropertyProto for key: (",
87 document.namespace_(), ", ", document.uri(), ")."));
88 }
89
90 if (!unique_properties.insert(property.name()).second) {
91 // Failed to insert because of duplicate property name
92 return absl_ports::AlreadyExistsError(absl_ports::StrCat(
93 "Property name '", property.name(), "' already exists for key: (",
94 document.namespace_(), ", ", document.uri(), ")."));
95 }
96
97 const auto& property_iter =
98 parsed_property_configs.property_config_map.find(property.name());
99 if (property_iter == parsed_property_configs.property_config_map.end()) {
100 return absl_ports::NotFoundError(absl_ports::StrCat(
101 "Property config '", property.name(), "' not found for key: (",
102 document.namespace_(), ", ", document.uri(),
103 ") of type: ", document.schema(), "."));
104 }
105 const PropertyConfigProto& property_config = *property_iter->second;
106
107 // Get the property value size according to data type.
108 int value_size = 0;
109 if (property_config.data_type() == PropertyConfigProto::DataType::STRING) {
110 value_size = property.string_values_size();
111 } else if (property_config.data_type() ==
112 PropertyConfigProto::DataType::INT64) {
113 value_size = property.int64_values_size();
114 } else if (property_config.data_type() ==
115 PropertyConfigProto::DataType::DOUBLE) {
116 value_size = property.double_values_size();
117 } else if (property_config.data_type() ==
118 PropertyConfigProto::DataType::BOOLEAN) {
119 value_size = property.boolean_values_size();
120 } else if (property_config.data_type() ==
121 PropertyConfigProto::DataType::BYTES) {
122 value_size = property.bytes_values_size();
123 } else if (property_config.data_type() ==
124 PropertyConfigProto::DataType::DOCUMENT) {
125 value_size = property.document_values_size();
126 }
127
128 if (property_config.cardinality() ==
129 PropertyConfigProto::Cardinality::OPTIONAL) {
130 if (value_size != 0 && value_size != 1) {
131 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
132 "Property '%s' is optional but %d elements are "
133 "found for key: (%s, %s).",
134 property.name().c_str(), value_size, document.namespace_().c_str(),
135 document.uri().c_str()));
136 }
137 } else if (property_config.cardinality() ==
138 PropertyConfigProto::Cardinality::REQUIRED) {
139 if (value_size != 1) {
140 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
141 "Property '%s' with only 1 value is required but "
142 "%d elements are found for key: (%s, %s).",
143 property.name().c_str(), value_size, document.namespace_().c_str(),
144 document.uri().c_str()));
145 }
146 num_required_properties_actual++;
147 }
148
149 // We put the validation for nested DocumentProto at last separately
150 // because it takes longer time to run. If any of the previous validations
151 // fail, we don't need to validate the extra documents.
152 if (property_config.data_type() ==
153 PropertyConfigProto::DataType::DOCUMENT) {
154 ICING_ASSIGN_OR_RETURN(
155 const std::unordered_set<SchemaTypeId>* nested_type_ids_expected,
156 schema_store_->GetSchemaTypeIdsWithChildren(
157 property_config.schema_type()));
158 for (const DocumentProto& nested_document : property.document_values()) {
159 libtextclassifier3::StatusOr<SchemaTypeId> nested_document_type_id_or =
160 schema_store_->GetSchemaTypeId(nested_document.schema());
161 if (!nested_document_type_id_or.ok() ||
162 nested_type_ids_expected->count(
163 nested_document_type_id_or.ValueOrDie()) == 0) {
164 return absl_ports::InvalidArgumentError(absl_ports::StrCat(
165 "Property '", property.name(), "' should be type or subtype of '",
166 property_config.schema_type(), "' but actual value has type '",
167 nested_document.schema(), "' for key: (", document.namespace_(),
168 ", ", document.uri(), ")."));
169 }
170 ICING_RETURN_IF_ERROR(Validate(nested_document, depth + 1));
171 }
172 }
173 }
174 if (num_required_properties_actual <
175 parsed_property_configs.num_required_properties) {
176 return absl_ports::InvalidArgumentError(
177 absl_ports::StrCat("One or more required fields missing for key: (",
178 document.namespace_(), ", ", document.uri(), ")."));
179 }
180 return libtextclassifier3::Status::OK;
181 }
182
183 } // namespace lib
184 } // namespace icing
185