1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_SCHEMA_SCHEMA_UTIL_H_ 16 #define ICING_SCHEMA_SCHEMA_UTIL_H_ 17 18 #include <cstdint> 19 #include <string> 20 #include <string_view> 21 #include <unordered_map> 22 #include <unordered_set> 23 24 #include "icing/text_classifier/lib3/utils/base/status.h" 25 #include "icing/text_classifier/lib3/utils/base/statusor.h" 26 #include "icing/proto/schema.pb.h" 27 28 namespace icing { 29 namespace lib { 30 31 class SchemaUtil { 32 public: 33 using TypeConfigMap = 34 std::unordered_map<std::string, const SchemaTypeConfigProto>; 35 36 // A data structure that stores the relationships between schema types. The 37 // keys in TypeRelationMap are schema types, and the values are sets of schema 38 // types that are directly or indirectly related to the key. 39 template <typename T> 40 using TypeRelationMap = 41 std::unordered_map<std::string_view, 42 std::unordered_map<std::string_view, T>>; 43 44 // If A -> B is indicated in the map, then type A must be built before 45 // building type B, which implies one of the following situations. 46 // 47 // 1. B has a property of type A. 48 // 2. A is a parent type of B via polymorphism. 49 // 50 // For the first case, this map will also include all PropertyConfigProto 51 // (with DOCUMENT data_type) pointers which *directly* connects type A and B. 52 // IOW, this vector of PropertyConfigProto* are "direct edges" connecting A 53 // and B directly. It will be an empty vector if A and B are not "directly" 54 // connected, but instead via another intermediate level of schema type. For 55 // example, the actual dependency is A -> C -> B, so there will be A -> C and 56 // C -> B with valid PropertyConfigProto* respectively in this map, but we 57 // will also expand transitive dependents: add A -> B into dependent map with 58 // empty vector of "edges". 59 using DependentMap = TypeRelationMap<std::vector<const PropertyConfigProto*>>; 60 61 // If A -> B is indicated in the map, then type A is a parent type of B, 62 // directly or indirectly. If directly, the bool value in the map will be 63 // true, otherwise false. 64 // 65 // Note that all relationships contained in this map are also entries in the 66 // DependentMap, i.e. if B inherits from A, then there will be a mapping from 67 // A to B in both this map and the DependentMap. 68 using InheritanceMap = TypeRelationMap<bool>; 69 70 struct SchemaDelta { 71 // Which schema types were present in the old schema, but were deleted from 72 // the new schema. 73 std::unordered_set<std::string> schema_types_deleted; 74 75 // Which schema types had their SchemaTypeConfigProto changed in a way that 76 // could invalidate existing Documents of that schema type. 77 std::unordered_set<std::string> schema_types_incompatible; 78 79 // Schema types that were added in the new schema. Represented by the 80 // `schema_type` field in the SchemaTypeConfigProto. 81 std::unordered_set<std::string> schema_types_new; 82 83 // Schema types that were changed in a way that was backwards compatible and 84 // didn't invalidate the index. Represented by the `schema_type` field in 85 // the SchemaTypeConfigProto. 86 std::unordered_set<std::string> schema_types_changed_fully_compatible; 87 88 // Schema types that were changed in a way that was backwards compatible, 89 // but invalidated the index. Represented by the `schema_type` field in the 90 // SchemaTypeConfigProto. 91 std::unordered_set<std::string> schema_types_index_incompatible; 92 93 // Schema types that were changed in a way that was backwards compatible, 94 // but invalidated the joinable cache. Represented by the `schema_type` 95 // field in the SchemaTypeConfigProto. 96 std::unordered_set<std::string> schema_types_join_incompatible; 97 98 bool operator==(const SchemaDelta& other) const { 99 return schema_types_deleted == other.schema_types_deleted && 100 schema_types_incompatible == other.schema_types_incompatible && 101 schema_types_new == other.schema_types_new && 102 schema_types_changed_fully_compatible == 103 other.schema_types_changed_fully_compatible && 104 schema_types_index_incompatible == 105 other.schema_types_index_incompatible && 106 schema_types_join_incompatible == 107 other.schema_types_join_incompatible; 108 } 109 }; 110 111 struct ParsedPropertyConfigs { 112 // Mapping of property name to PropertyConfigProto 113 std::unordered_map<std::string_view, const PropertyConfigProto*> 114 property_config_map; 115 116 // Properties that have an indexing config 117 std::unordered_set<std::string_view> indexed_properties; 118 119 // Properties that were REQUIRED 120 std::unordered_set<std::string_view> required_properties; 121 122 // Properties that have joinable config 123 std::unordered_set<std::string_view> joinable_properties; 124 125 // Properties that have DataType::DOCUMENT 126 std::unordered_set<std::string_view> nested_document_properties; 127 }; 128 129 // This function validates: 130 // 1. SchemaTypeConfigProto.schema_type's must be unique 131 // 2. Properties within one SchemaTypeConfigProto must be unique 132 // 3. SchemaTypeConfigProtos.schema_type must be non-empty 133 // 4. PropertyConfigProtos.property_name must be non-empty 134 // 5. PropertyConfigProtos.property_name's must be unique within one 135 // SchemaTypeConfigProto 136 // 6. PropertyConfigProtos.data_type cannot be UNKNOWN 137 // 7. PropertyConfigProtos.data_type of DOCUMENT must also have a 138 // schema_type 139 // 8. PropertyConfigProtos.cardinality cannot be UNKNOWN 140 // 9. PropertyConfigProtos.schema_type's must correspond to a 141 // SchemaTypeConfigProto.schema_type 142 // 10. Property names can only be alphanumeric. 143 // 11. Any STRING data types have a valid string_indexing_config 144 // 12. PropertyConfigProtos.joinable_config must be valid. See 145 // ValidateJoinableConfig for more details. 146 // 13. Any PropertyConfigProtos with nested DOCUMENT data type must not have 147 // REPEATED cardinality if they reference a schema type containing 148 // joinable property. 149 // 14. The schema definition cannot have invalid cycles. A cycle is invalid 150 // if: 151 // a. SchemaTypeConfigProto.parent_type definitions form an inheritance 152 // cycle. 153 // b. The schema's property definitions have schema_types that form a 154 // cycle, and all properties on the cycle declare 155 // DocumentIndexingConfig.index_nested_properties=true. 156 // c. The schema's property definitions have schema_types that form a 157 // cycle, and the cycle leads to an invalid joinable property config. 158 // This is the case if: 159 // i. Any type node in the cycle itself has a joinable proprty 160 // (property whose joinable config is not NONE), OR 161 // ii. Any type node in the cycle has a nested-type (direct or 162 // indirect) with a joinable property. 163 // 15. For DOCUMENT data types, if 164 // DocumentIndexingConfig.indexable_nested_properties_list is non-empty, 165 // DocumentIndexingConfig.index_nested_properties must be false. 166 // 167 // Returns: 168 // On success, a dependent map from each types to their dependent types 169 // that depend on it directly or indirectly. 170 // ALREADY_EXISTS for case 1 and 2 171 // INVALID_ARGUMENT for 3-15 172 static libtextclassifier3::StatusOr<DependentMap> Validate( 173 const SchemaProto& schema, bool allow_circular_schema_definitions); 174 175 // Builds a transitive inheritance map. 176 // 177 // Ex. Suppose we have a schema with four types A, B, C and D, and we have the 178 // following direct inheritance relation. 179 // 180 // A -> B (A is the parent type of B) 181 // B -> C (B is the parent type of C) 182 // C -> D (C is the parent type of D) 183 // 184 // Then, the transitive inheritance map for this schema would be: 185 // 186 // A -> B, C, D 187 // B -> C, D 188 // C -> D 189 // 190 // RETURNS: 191 // On success, a transitive inheritance map of all types in the schema. 192 // INVALID_ARGUMENT if the inheritance graph contains a cycle. 193 static libtextclassifier3::StatusOr<SchemaUtil::InheritanceMap> 194 BuildTransitiveInheritanceGraph(const SchemaProto& schema); 195 196 // Creates a mapping of schema type -> schema type config proto. The 197 // type_config_map is cleared, and then each schema-type_config_proto pair is 198 // placed in the given type_config_map parameter. 199 static void BuildTypeConfigMap(const SchemaProto& schema, 200 TypeConfigMap* type_config_map); 201 202 // Parses the given type_config and returns a struct of easily-parseable 203 // information about the properties. 204 static ParsedPropertyConfigs ParsePropertyConfigs( 205 const SchemaTypeConfigProto& type_config); 206 207 // Computes the delta between the old and new schema. There are a few 208 // differences that'll be reported: 209 // 1. The derived index would be incompatible. This is held in 210 // `SchemaDelta.index_incompatible`. 211 // 2. Some schema types existed in the old schema, but have been deleted 212 // from the new schema. This is held in 213 // `SchemaDelta.schema_types_deleted` 214 // 3. A schema type's new definition would mean any existing data of the old 215 // definition is now incompatible. 216 // 4. The derived join index would be incompatible. This is held in 217 // `SchemaDelta.join_incompatible`. 218 // 219 // For case 1, the two schemas would result in an incompatible index if: 220 // 1.1. The new SchemaProto has a different set of indexed properties than 221 // the old SchemaProto. 222 // 223 // For case 3, the two schemas would result in incompatible data if: 224 // 3.1. A SchemaTypeConfig exists in the old SchemaProto, but is not in the 225 // new SchemaProto 226 // 3.2. A property exists in the old SchemaTypeConfig, but is not in the new 227 // SchemaTypeConfig 228 // 3.3. A property in the new SchemaTypeConfig and has a REQUIRED 229 // PropertyConfigProto.cardinality, but is not in the old 230 // SchemaTypeConfig 231 // 3.4. A property is in both the old and new SchemaTypeConfig, but its 232 // PropertyConfigProto.data_type is different 233 // 3.5. A property is in both the old and new SchemaTypeConfig, but its 234 // PropertyConfigProto.schema_type is different 235 // 3.6. A property is in both the old and new SchemaTypeConfig, but its new 236 // PropertyConfigProto.cardinality is more restrictive. Restrictive 237 // scale defined as: 238 // LEAST <REPEATED - OPTIONAL - REQUIRED> MOST 239 // 240 // For case 4, the two schemas would result in an incompatible join if: 241 // 4.1. A SchematypeConfig exists in the new SchemaProto that has a 242 // different set of joinable properties than it did in the old 243 // SchemaProto. 244 // 245 // A property is defined by the combination of the 246 // SchemaTypeConfig.schema_type and the PropertyConfigProto.property_name. 247 // 248 // Returns a SchemaDelta that captures the aforementioned differences. 249 static const SchemaDelta ComputeCompatibilityDelta( 250 const SchemaProto& old_schema, const SchemaProto& new_schema, 251 const DependentMap& new_schema_dependent_map); 252 253 // Validates the 'property_name' field. 254 // 1. Can't be an empty string 255 // 2. Can only contain alphanumeric characters 256 // 257 // NOTE: schema_type is only used for logging. It is not necessary to populate 258 // it. 259 // 260 // RETURNS: 261 // - OK if property_name is valid 262 // - INVALID_ARGUMENT if property name is empty or contains an 263 // non-alphabetic character. 264 static libtextclassifier3::Status ValidatePropertyName( 265 std::string_view property_name, std::string_view schema_type = ""); 266 267 static bool IsIndexedProperty(const PropertyConfigProto& property_config); 268 269 private: 270 // Validates the 'schema_type' field 271 // 272 // Returns: 273 // INVALID_ARGUMENT if 'schema_type' is an empty string. 274 // OK on success 275 static libtextclassifier3::Status ValidateSchemaType( 276 std::string_view schema_type); 277 278 // Validates the 'data_type' field. 279 // 280 // Returns: 281 // INVALID_ARGUMENT if it's UNKNOWN 282 // OK on success 283 static libtextclassifier3::Status ValidateDataType( 284 PropertyConfigProto::DataType::Code data_type, 285 std::string_view schema_type, std::string_view property_name); 286 287 // Validates the 'cardinality' field. 288 // 289 // Returns: 290 // INVALID_ARGUMENT if it's UNKNOWN 291 // OK on success 292 static libtextclassifier3::Status ValidateCardinality( 293 PropertyConfigProto::Cardinality::Code cardinality, 294 std::string_view schema_type, std::string_view property_name); 295 296 // Checks that the 'string_indexing_config' satisfies the following rules: 297 // 1. Only STRING data types can be indexed 298 // 2. An indexed property must have a valid tokenizer type 299 // 300 // Returns: 301 // INVALID_ARGUMENT if any of the rules are not followed 302 // OK on success 303 static libtextclassifier3::Status ValidateStringIndexingConfig( 304 const StringIndexingConfig& config, 305 PropertyConfigProto::DataType::Code data_type, 306 std::string_view schema_type, std::string_view property_name); 307 308 // Checks that the 'joinable_config' satisfies the following rules: 309 // 1. If the data type matches joinable value type 310 // a. Only STRING data types can use QUALIFIED_ID joinable value type 311 // 2. Only QUALIFIED_ID joinable value type can have delete propagation 312 // enabled 313 // 3. Any joinable property should have non-REPEATED cardinality 314 // 315 // Returns: 316 // INVALID_ARGUMENT if any of the rules are not followed 317 // OK on success 318 static libtextclassifier3::Status ValidateJoinableConfig( 319 const JoinableConfig& config, 320 PropertyConfigProto::DataType::Code data_type, 321 PropertyConfigProto::Cardinality::Code cardinality, 322 std::string_view schema_type, std::string_view property_name); 323 324 // Checks that the 'document_indexing_config' satisfies the following rule: 325 // 1. If indexable_nested_properties is non-empty, index_nested_properties 326 // must be set to false. 327 // 328 // Returns: 329 // INVALID_ARGUMENT if any of the rules are not followed 330 // OK on success 331 static libtextclassifier3::Status ValidateDocumentIndexingConfig( 332 const DocumentIndexingConfig& config, std::string_view schema_type, 333 std::string_view property_name); 334 335 // Returns if 'parent_type' is a direct or indirect parent of 'child_type'. 336 static bool IsParent(const SchemaUtil::InheritanceMap& inheritance_map, 337 std::string_view parent_type, 338 std::string_view child_type); 339 340 // Returns if 'child_property_config' in a child type can override 341 // 'parent_property_config' in the parent type. 342 // 343 // Let's assign 'child_property_config' a type T1 and 'parent_property_config' 344 // a type T2 that captures information for their data_type, schema_type and 345 // cardinalities, so that 'child_property_config' can override 346 // 'parent_property_config' if and only if T1 <: T2, i.e. T1 is a subtype of 347 // T2. 348 // 349 // Below are the rules for inferring subtype relations. 350 // - T <: T for every type T. 351 // - If U extends T, then U <: T. 352 // - For every type T1, T2 and T3, if T1 <: T2 and T2 <: T3, then T1 <: T3. 353 // - Optional<T> <: Repeated<T> for every type T. 354 // - Required<T> <: Optional<T> for every type T. 355 // - If T1 <: T2, then 356 // - Required<T1> <: Required<T2> 357 // - Optional<T1> <: Optional<T2> 358 // - Repeated<T1> <: Repeated<T2> 359 // 360 // We assume the Closed World Assumption (CWA), i.e. if T1 <: T2 cannot be 361 // deduced from the above rules, then T1 is not a subtype of T2. 362 static bool IsInheritedPropertyCompatible( 363 const SchemaUtil::InheritanceMap& inheritance_map, 364 const PropertyConfigProto& child_property_config, 365 const PropertyConfigProto& parent_property_config); 366 367 // Verifies that every child type's property set has included all compatible 368 // properties from parent types, based on the following rule: 369 // 370 // - If a property "prop" of type T is in the parent, then the child type must 371 // also have "prop" that is of type U, such that U <: T, i.e. U is a subtype 372 // of T. 373 // 374 // RETURNS: 375 // Ok on validation success 376 // INVALID_ARGUMENT if an exception that violates the above validation rule 377 // is found. 378 static libtextclassifier3::Status ValidateInheritedProperties( 379 const SchemaProto& schema); 380 }; 381 382 } // namespace lib 383 } // namespace icing 384 385 #endif // ICING_SCHEMA_SCHEMA_UTIL_H_ 386