• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/util/document-validator.h"
16 
17 #include <cstdint>
18 #include <string>
19 #include <string_view>
20 #include <unordered_map>
21 #include <unordered_set>
22 #include <utility>
23 
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/absl_ports/canonical_errors.h"
27 #include "icing/absl_ports/str_cat.h"
28 #include "icing/legacy/core/icing-string-util.h"
29 #include "icing/proto/document.pb.h"
30 #include "icing/proto/schema.pb.h"
31 #include "icing/schema/schema-store.h"
32 #include "icing/schema/schema-util.h"
33 #include "icing/store/document-filter-data.h"
34 #include "icing/util/logging.h"
35 #include "icing/util/status-macros.h"
36 
37 namespace icing {
38 namespace lib {
39 
// Length in bytes of a SHA-256 digest (256 bits / 8). Validate() rejects any
// blob handle whose digest is not exactly this long.
static constexpr int32_t kSha256LengthBytes = 32;

// Map from a string_view key to a raw pointer to a PropertyConfigProto.
// Neither the key (a view) nor the value (a raw pointer) is owned by the map.
// NOTE(review): presumably keyed by property name; this alias is not
// referenced in the part of the file visible here - confirm before relying on
// it or removing it.
using PropertyConfigMap =
    std::unordered_map<std::string_view, const PropertyConfigProto*>;
44 
DocumentValidator(const SchemaStore * schema_store)45 DocumentValidator::DocumentValidator(const SchemaStore* schema_store)
46     : schema_store_(schema_store) {}
47 
Validate(const DocumentProto & document,int depth)48 libtextclassifier3::Status DocumentValidator::Validate(
49     const DocumentProto& document, int depth) {
50   if (document.namespace_().empty()) {
51     return absl_ports::InvalidArgumentError("Field 'namespace' is empty.");
52   }
53 
54   // Only require a non-empty uri on top-level documents.
55   if (depth == 0 && document.uri().empty()) {
56     return absl_ports::InvalidArgumentError("Field 'uri' is empty.");
57   }
58 
59   if (document.schema().empty()) {
60     return absl_ports::InvalidArgumentError(
61         absl_ports::StrCat("Field 'schema' is empty for key: (",
62                            document.namespace_(), ", ", document.uri(), ")."));
63   }
64 
65   if (document.score() < 0) {
66     return absl_ports::InvalidArgumentError("Field 'score' is negative.");
67   }
68 
69   if (document.creation_timestamp_ms() < 0) {
70     return absl_ports::InvalidArgumentError(
71         "Field 'creation_timestamp_ms' is negative.");
72   }
73 
74   if (document.ttl_ms() < 0) {
75     return absl_ports::InvalidArgumentError("Field 'ttl_ms' is negative.");
76   }
77 
78   // TODO(b/144458732): Implement a more robust version of
79   // ICING_ASSIGN_OR_RETURN that can support error logging.
80   auto type_config_or = schema_store_->GetSchemaTypeConfig(document.schema());
81   if (!type_config_or.ok()) {
82     ICING_LOG(ERROR) << type_config_or.status().error_message()
83                      << "Error while validating document ("
84                      << document.namespace_() << ", " << document.uri() << ")";
85     return type_config_or.status();
86   }
87   const SchemaTypeConfigProto* type_config =
88       std::move(type_config_or).ValueOrDie();
89 
90   int32_t num_required_properties_actual = 0;
91   SchemaUtil::ParsedPropertyConfigs parsed_property_configs =
92       SchemaUtil::ParsePropertyConfigs(*type_config);
93   std::unordered_set<std::string_view> unique_properties;
94 
95   for (const PropertyProto& property : document.properties()) {
96     if (property.name().empty()) {
97       return absl_ports::InvalidArgumentError(absl_ports::StrCat(
98           "Field 'name' is empty in PropertyProto for key: (",
99           document.namespace_(), ", ", document.uri(), ")."));
100     }
101 
102     if (!unique_properties.insert(property.name()).second) {
103       // Failed to insert because of duplicate property name
104       return absl_ports::AlreadyExistsError(absl_ports::StrCat(
105           "Property name '", property.name(), "' already exists for key: (",
106           document.namespace_(), ", ", document.uri(), ")."));
107     }
108 
109     const auto& property_iter =
110         parsed_property_configs.property_config_map.find(property.name());
111     if (property_iter == parsed_property_configs.property_config_map.end()) {
112       return absl_ports::NotFoundError(absl_ports::StrCat(
113           "Property config '", property.name(), "' not found for key: (",
114           document.namespace_(), ", ", document.uri(),
115           ") of type: ", document.schema(), "."));
116     }
117     const PropertyConfigProto& property_config = *property_iter->second;
118 
119     // Get the property value size according to data type.
120     int value_size = 0;
121     if (property_config.data_type() == PropertyConfigProto::DataType::STRING) {
122       value_size = property.string_values_size();
123     } else if (property_config.data_type() ==
124                PropertyConfigProto::DataType::INT64) {
125       value_size = property.int64_values_size();
126     } else if (property_config.data_type() ==
127                PropertyConfigProto::DataType::DOUBLE) {
128       value_size = property.double_values_size();
129     } else if (property_config.data_type() ==
130                PropertyConfigProto::DataType::BOOLEAN) {
131       value_size = property.boolean_values_size();
132     } else if (property_config.data_type() ==
133                PropertyConfigProto::DataType::BYTES) {
134       value_size = property.bytes_values_size();
135     } else if (property_config.data_type() ==
136                PropertyConfigProto::DataType::DOCUMENT) {
137       value_size = property.document_values_size();
138     } else if (property_config.data_type() ==
139                PropertyConfigProto::DataType::VECTOR) {
140       value_size = property.vector_values_size();
141       for (const PropertyProto::VectorProto& vector_value :
142            property.vector_values()) {
143         if (vector_value.values_size() == 0) {
144           return absl_ports::InvalidArgumentError(absl_ports::StrCat(
145               "Property '", property.name(),
146               "' contains empty vectors for key: (", document.namespace_(),
147               ", ", document.uri(), ")."));
148         }
149       }
150     } else if (property_config.data_type() ==
151                PropertyConfigProto::DataType::BLOB_HANDLE) {
152       value_size = property.blob_handle_values_size();
153       for (const PropertyProto::BlobHandleProto& blob_handle_value :
154            property.blob_handle_values()) {
155         if (blob_handle_value.digest().size() != kSha256LengthBytes) {
156           return absl_ports::InvalidArgumentError(absl_ports::StrCat(
157               "Property '", property.name(),
158               "' contains non sha-256 blob digest for key: (",
159               document.namespace_(), ", ", document.uri(), ")."));
160         }
161       }
162     }
163 
164     if (property_config.cardinality() ==
165         PropertyConfigProto::Cardinality::OPTIONAL) {
166       if (value_size != 0 && value_size != 1) {
167         return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
168             "Property '%s' is optional but %d elements are "
169             "found for key: (%s, %s).",
170             property.name().c_str(), value_size, document.namespace_().c_str(),
171             document.uri().c_str()));
172       }
173     } else if (property_config.cardinality() ==
174                PropertyConfigProto::Cardinality::REQUIRED) {
175       if (value_size != 1) {
176         return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
177             "Property '%s' with only 1 value is required but "
178             "%d elements are found for key: (%s, %s).",
179             property.name().c_str(), value_size, document.namespace_().c_str(),
180             document.uri().c_str()));
181       }
182       num_required_properties_actual++;
183     }
184 
185     // We put the validation for nested DocumentProto at last separately
186     // because it takes longer time to run. If any of the previous validations
187     // fail, we don't need to validate the extra documents.
188     if (property_config.data_type() ==
189         PropertyConfigProto::DataType::DOCUMENT) {
190       ICING_ASSIGN_OR_RETURN(
191           const std::unordered_set<SchemaTypeId>* nested_type_ids_expected,
192           schema_store_->GetSchemaTypeIdsWithChildren(
193               property_config.schema_type()));
194       for (const DocumentProto& nested_document : property.document_values()) {
195         libtextclassifier3::StatusOr<SchemaTypeId> nested_document_type_id_or =
196             schema_store_->GetSchemaTypeId(nested_document.schema());
197         if (!nested_document_type_id_or.ok() ||
198             nested_type_ids_expected->count(
199                 nested_document_type_id_or.ValueOrDie()) == 0) {
200           return absl_ports::InvalidArgumentError(absl_ports::StrCat(
201               "Property '", property.name(), "' should be type or subtype of '",
202               property_config.schema_type(), "' but actual value has type '",
203               nested_document.schema(), "' for key: (", document.namespace_(),
204               ", ", document.uri(), ")."));
205         }
206         ICING_RETURN_IF_ERROR(Validate(nested_document, depth + 1));
207       }
208     }
209   }
210   if (num_required_properties_actual <
211       parsed_property_configs.required_properties.size()) {
212     return absl_ports::InvalidArgumentError(
213         absl_ports::StrCat("One or more required fields missing for key: (",
214                            document.namespace_(), ", ", document.uri(), ")."));
215   }
216   return libtextclassifier3::Status::OK;
217 }
218 
219 }  // namespace lib
220 }  // namespace icing
221