1 // Copyright (C) 2019 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_SCHEMA_SCHEMA_UTIL_H_ 16 #define ICING_SCHEMA_SCHEMA_UTIL_H_ 17 18 #include <cstdint> 19 #include <string> 20 #include <string_view> 21 #include <unordered_map> 22 #include <unordered_set> 23 24 #include "icing/text_classifier/lib3/utils/base/status.h" 25 #include "icing/text_classifier/lib3/utils/base/statusor.h" 26 #include "icing/proto/schema.pb.h" 27 28 namespace icing { 29 namespace lib { 30 31 class SchemaUtil { 32 public: 33 using TypeConfigMap = 34 std::unordered_map<std::string, const SchemaTypeConfigProto>; 35 36 // A data structure that stores the relationships between schema types. The 37 // keys in TypeRelationMap are schema types, and the values are sets of schema 38 // types that are directly or indirectly related to the key. 39 template <typename T> 40 using TypeRelationMap = 41 std::unordered_map<std::string_view, 42 std::unordered_map<std::string_view, T>>; 43 44 // If A -> B is indicated in the map, then type A must be built before 45 // building type B, which implies one of the following situations. 46 // 47 // 1. B has a property of type A. 48 // 2. A is a parent type of B via polymorphism. 49 // 50 // For the first case, this map will also include all PropertyConfigProto 51 // (with DOCUMENT data_type) pointers which *directly* connects type A and B. 52 // IOW, this vector of PropertyConfigProto* are "direct edges" connecting A 53 // and B directly. It will be an empty vector if A and B are not "directly" 54 // connected, but instead via another intermediate level of schema type. For 55 // example, the actual dependency is A -> C -> B, so there will be A -> C and 56 // C -> B with valid PropertyConfigProto* respectively in this map, but we 57 // will also expand transitive dependents: add A -> B into dependent map with 58 // empty vector of "edges". 59 using DependentMap = TypeRelationMap<std::vector<const PropertyConfigProto*>>; 60 61 // If A -> B is indicated in the map, then type A is a parent type of B, 62 // directly or indirectly. If directly, the bool value in the map will be 63 // true, otherwise false. 64 // 65 // Note that all relationships contained in this map are also entries in the 66 // DependentMap, i.e. if B inherits from A, then there will be a mapping from 67 // A to B in both this map and the DependentMap. 68 using InheritanceMap = TypeRelationMap<bool>; 69 70 struct SchemaDelta { 71 // Which schema types were present in the old schema, but were deleted from 72 // the new schema. 73 std::unordered_set<std::string> schema_types_deleted; 74 75 // Which schema types had their SchemaTypeConfigProto changed in a way that 76 // could invalidate existing Documents of that schema type. 77 std::unordered_set<std::string> schema_types_incompatible; 78 79 // Schema types that were added in the new schema. Represented by the 80 // `schema_type` field in the SchemaTypeConfigProto. 81 std::unordered_set<std::string> schema_types_new; 82 83 // Schema types that were changed in a way that was backwards compatible and 84 // didn't invalidate the index. Represented by the `schema_type` field in 85 // the SchemaTypeConfigProto. 86 std::unordered_set<std::string> schema_types_changed_fully_compatible; 87 88 // Schema types that were changed in a way that was backwards compatible, 89 // but invalidated the index. Represented by the `schema_type` field in the 90 // SchemaTypeConfigProto. 91 std::unordered_set<std::string> schema_types_index_incompatible; 92 93 // Schema types that were changed in a way that was backwards compatible, 94 // but invalidated the joinable cache. Represented by the `schema_type` 95 // field in the SchemaTypeConfigProto. 96 std::unordered_set<std::string> schema_types_join_incompatible; 97 98 bool operator==(const SchemaDelta& other) const { 99 return schema_types_deleted == other.schema_types_deleted && 100 schema_types_incompatible == other.schema_types_incompatible && 101 schema_types_new == other.schema_types_new && 102 schema_types_changed_fully_compatible == 103 other.schema_types_changed_fully_compatible && 104 schema_types_index_incompatible == 105 other.schema_types_index_incompatible && 106 schema_types_join_incompatible == 107 other.schema_types_join_incompatible; 108 } 109 }; 110 111 struct ParsedPropertyConfigs { 112 // Mapping of property name to PropertyConfigProto 113 std::unordered_map<std::string_view, const PropertyConfigProto*> 114 property_config_map; 115 116 // Total number of properties that have an indexing config 117 int32_t num_indexed_properties = 0; 118 119 // Total number of properties that were REQUIRED 120 int32_t num_required_properties = 0; 121 122 // Total number of properties that have joinable config 123 int32_t num_joinable_properties = 0; 124 }; 125 126 // This function validates: 127 // 1. SchemaTypeConfigProto.schema_type's must be unique 128 // 2. Properties within one SchemaTypeConfigProto must be unique 129 // 3. SchemaTypeConfigProtos.schema_type must be non-empty 130 // 4. PropertyConfigProtos.property_name must be non-empty 131 // 5. PropertyConfigProtos.property_name's must be unique within one 132 // SchemaTypeConfigProto 133 // 6. PropertyConfigProtos.data_type cannot be UNKNOWN 134 // 7. PropertyConfigProtos.data_type of DOCUMENT must also have a 135 // schema_type 136 // 8. PropertyConfigProtos.cardinality cannot be UNKNOWN 137 // 9. PropertyConfigProtos.schema_type's must correspond to a 138 // SchemaTypeConfigProto.schema_type 139 // 10. Property names can only be alphanumeric. 140 // 11. Any STRING data types have a valid string_indexing_config 141 // 12. PropertyConfigProtos.joinable_config must be valid. See 142 // ValidateJoinableConfig for more details. 143 // 13. Any PropertyConfigProtos with nested DOCUMENT data type must not have 144 // REPEATED cardinality if they reference a schema type containing 145 // joinable property. 146 // 14. The schema definition cannot have invalid cycles. A cycle is invalid 147 // if: 148 // a. SchemaTypeConfigProto.parent_type definitions form an inheritance 149 // cycle. 150 // b. The schema's property definitions have schema_types that form a 151 // cycle, and all properties on the cycle declare 152 // DocumentIndexingConfig.index_nested_properties=true. 153 // c. The schema's property definitions have schema_types that form a 154 // cycle, and the cycle leads to an invalid joinable property config. 155 // This is the case if: 156 // i. Any type node in the cycle itself has a joinable proprty 157 // (property whose joinable config is not NONE), OR 158 // ii. Any type node in the cycle has a nested-type (direct or 159 // indirect) with a joinable property. 160 // 161 // Returns: 162 // On success, a dependent map from each types to their dependent types 163 // that depend on it directly or indirectly. 164 // ALREADY_EXISTS for case 1 and 2 165 // INVALID_ARGUMENT for 3-15 166 static libtextclassifier3::StatusOr<DependentMap> Validate( 167 const SchemaProto& schema, bool allow_circular_schema_definitions); 168 169 // Builds a transitive inheritance map. 170 // 171 // Ex. Suppose we have a schema with four types A, B, C and D, and we have the 172 // following direct inheritance relation. 173 // 174 // A -> B (A is the parent type of B) 175 // B -> C (B is the parent type of C) 176 // C -> D (C is the parent type of D) 177 // 178 // Then, the transitive inheritance map for this schema would be: 179 // 180 // A -> B, C, D 181 // B -> C, D 182 // C -> D 183 // 184 // RETURNS: 185 // On success, a transitive inheritance map of all types in the schema. 186 // INVALID_ARGUMENT if the inheritance graph contains a cycle. 187 static libtextclassifier3::StatusOr<SchemaUtil::InheritanceMap> 188 BuildTransitiveInheritanceGraph(const SchemaProto& schema); 189 190 // Creates a mapping of schema type -> schema type config proto. The 191 // type_config_map is cleared, and then each schema-type_config_proto pair is 192 // placed in the given type_config_map parameter. 193 static void BuildTypeConfigMap(const SchemaProto& schema, 194 TypeConfigMap* type_config_map); 195 196 // Parses the given type_config and returns a struct of easily-parseable 197 // information about the properties. 198 static ParsedPropertyConfigs ParsePropertyConfigs( 199 const SchemaTypeConfigProto& type_config); 200 201 // Computes the delta between the old and new schema. There are a few 202 // differences that'll be reported: 203 // 1. The derived index would be incompatible. This is held in 204 // `SchemaDelta.index_incompatible`. 205 // 2. Some schema types existed in the old schema, but have been deleted 206 // from the new schema. This is held in 207 // `SchemaDelta.schema_types_deleted` 208 // 3. A schema type's new definition would mean any existing data of the old 209 // definition is now incompatible. 210 // 4. The derived join index would be incompatible. This is held in 211 // `SchemaDelta.join_incompatible`. 212 // 213 // For case 1, the two schemas would result in an incompatible index if: 214 // 1.1. The new SchemaProto has a different set of indexed properties than 215 // the old SchemaProto. 216 // 217 // For case 3, the two schemas would result in incompatible data if: 218 // 3.1. A SchemaTypeConfig exists in the old SchemaProto, but is not in the 219 // new SchemaProto 220 // 3.2. A property exists in the old SchemaTypeConfig, but is not in the new 221 // SchemaTypeConfig 222 // 3.3. A property in the new SchemaTypeConfig and has a REQUIRED 223 // PropertyConfigProto.cardinality, but is not in the old 224 // SchemaTypeConfig 225 // 3.4. A property is in both the old and new SchemaTypeConfig, but its 226 // PropertyConfigProto.data_type is different 227 // 3.5. A property is in both the old and new SchemaTypeConfig, but its 228 // PropertyConfigProto.schema_type is different 229 // 3.6. A property is in both the old and new SchemaTypeConfig, but its new 230 // PropertyConfigProto.cardinality is more restrictive. Restrictive 231 // scale defined as: 232 // LEAST <REPEATED - OPTIONAL - REQUIRED> MOST 233 // 234 // For case 4, the two schemas would result in an incompatible join if: 235 // 4.1. A SchematypeConfig exists in the new SchemaProto that has a 236 // different set of joinable properties than it did in the old 237 // SchemaProto. 238 // 239 // A property is defined by the combination of the 240 // SchemaTypeConfig.schema_type and the PropertyConfigProto.property_name. 241 // 242 // Returns a SchemaDelta that captures the aforementioned differences. 243 static const SchemaDelta ComputeCompatibilityDelta( 244 const SchemaProto& old_schema, const SchemaProto& new_schema, 245 const DependentMap& new_schema_dependent_map); 246 247 // Validates the 'property_name' field. 248 // 1. Can't be an empty string 249 // 2. Can only contain alphanumeric characters 250 // 251 // NOTE: schema_type is only used for logging. It is not necessary to populate 252 // it. 253 // 254 // RETURNS: 255 // - OK if property_name is valid 256 // - INVALID_ARGUMENT if property name is empty or contains an 257 // non-alphabetic character. 258 static libtextclassifier3::Status ValidatePropertyName( 259 std::string_view property_name, std::string_view schema_type = ""); 260 261 static bool IsIndexedProperty(const PropertyConfigProto& property_config); 262 263 private: 264 // Validates the 'schema_type' field 265 // 266 // Returns: 267 // INVALID_ARGUMENT if 'schema_type' is an empty string. 268 // OK on success 269 static libtextclassifier3::Status ValidateSchemaType( 270 std::string_view schema_type); 271 272 // Validates the 'data_type' field. 273 // 274 // Returns: 275 // INVALID_ARGUMENT if it's UNKNOWN 276 // OK on success 277 static libtextclassifier3::Status ValidateDataType( 278 PropertyConfigProto::DataType::Code data_type, 279 std::string_view schema_type, std::string_view property_name); 280 281 // Validates the 'cardinality' field. 282 // 283 // Returns: 284 // INVALID_ARGUMENT if it's UNKNOWN 285 // OK on success 286 static libtextclassifier3::Status ValidateCardinality( 287 PropertyConfigProto::Cardinality::Code cardinality, 288 std::string_view schema_type, std::string_view property_name); 289 290 // Checks that the 'string_indexing_config' satisfies the following rules: 291 // 1. Only STRING data types can be indexed 292 // 2. An indexed property must have a valid tokenizer type 293 // 294 // Returns: 295 // INVALID_ARGUMENT if any of the rules are not followed 296 // OK on success 297 static libtextclassifier3::Status ValidateStringIndexingConfig( 298 const StringIndexingConfig& config, 299 PropertyConfigProto::DataType::Code data_type, 300 std::string_view schema_type, std::string_view property_name); 301 302 // Checks that the 'joinable_config' satisfies the following rules: 303 // 1. If the data type matches joinable value type 304 // a. Only STRING data types can use QUALIFIED_ID joinable value type 305 // 2. Only QUALIFIED_ID joinable value type can have delete propagation 306 // enabled 307 // 3. Any joinable property should have non-REPEATED cardinality 308 // 309 // Returns: 310 // INVALID_ARGUMENT if any of the rules are not followed 311 // OK on success 312 static libtextclassifier3::Status ValidateJoinableConfig( 313 const JoinableConfig& config, 314 PropertyConfigProto::DataType::Code data_type, 315 PropertyConfigProto::Cardinality::Code cardinality, 316 std::string_view schema_type, std::string_view property_name); 317 318 // Returns if 'parent_type' is a direct or indirect parent of 'child_type'. 319 static bool IsParent(const SchemaUtil::InheritanceMap& inheritance_map, 320 std::string_view parent_type, 321 std::string_view child_type); 322 323 // Returns if 'child_property_config' in a child type can override 324 // 'parent_property_config' in the parent type. 325 // 326 // Let's assign 'child_property_config' a type T1 and 'parent_property_config' 327 // a type T2 that captures information for their data_type, schema_type and 328 // cardinalities, so that 'child_property_config' can override 329 // 'parent_property_config' if and only if T1 <: T2, i.e. T1 is a subtype of 330 // T2. 331 // 332 // Below are the rules for inferring subtype relations. 333 // - T <: T for every type T. 334 // - If U extends T, then U <: T. 335 // - For every type T1, T2 and T3, if T1 <: T2 and T2 <: T3, then T1 <: T3. 336 // - Optional<T> <: Repeated<T> for every type T. 337 // - Required<T> <: Optional<T> for every type T. 338 // - If T1 <: T2, then 339 // - Required<T1> <: Required<T2> 340 // - Optional<T1> <: Optional<T2> 341 // - Repeated<T1> <: Repeated<T2> 342 // 343 // We assume the Closed World Assumption (CWA), i.e. if T1 <: T2 cannot be 344 // deduced from the above rules, then T1 is not a subtype of T2. 345 static bool IsInheritedPropertyCompatible( 346 const SchemaUtil::InheritanceMap& inheritance_map, 347 const PropertyConfigProto& child_property_config, 348 const PropertyConfigProto& parent_property_config); 349 350 // Verifies that every child type's property set has included all compatible 351 // properties from parent types, based on the following rule: 352 // 353 // - If a property "prop" of type T is in the parent, then the child type must 354 // also have "prop" that is of type U, such that U <: T, i.e. U is a subtype 355 // of T. 356 // 357 // RETURNS: 358 // Ok on validation success 359 // INVALID_ARGUMENT if an exception that violates the above validation rule 360 // is found. 361 static libtextclassifier3::Status ValidateInheritedProperties( 362 const SchemaProto& schema); 363 }; 364 365 } // namespace lib 366 } // namespace icing 367 368 #endif // ICING_SCHEMA_SCHEMA_UTIL_H_ 369