1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/schema/section-manager.h"
16
17 #include <algorithm>
18 #include <cstdint>
19 #include <string>
20 #include <string_view>
21 #include <utility>
22 #include <vector>
23
24 #include "icing/text_classifier/lib3/utils/base/status.h"
25 #include "icing/text_classifier/lib3/utils/base/statusor.h"
26 #include "icing/absl_ports/canonical_errors.h"
27 #include "icing/legacy/core/icing-string-util.h"
28 #include "icing/proto/document.pb.h"
29 #include "icing/proto/schema.pb.h"
30 #include "icing/proto/term.pb.h"
31 #include "icing/schema/property-util.h"
32 #include "icing/schema/section.h"
33 #include "icing/store/document-filter-data.h"
34 #include "icing/store/key-mapper.h"
35 #include "icing/util/status-macros.h"
36
37 namespace icing {
38 namespace lib {
39
40 namespace {
41
42 // Helper function to append a new section metadata
AppendNewSectionMetadata(std::vector<SectionMetadata> * metadata_list,std::string && concatenated_path,const PropertyConfigProto & property_config)43 libtextclassifier3::Status AppendNewSectionMetadata(
44 std::vector<SectionMetadata>* metadata_list,
45 std::string&& concatenated_path,
46 const PropertyConfigProto& property_config) {
47 // Validates next section id, makes sure that section id is the same as the
48 // list index so that we could find any section metadata by id in O(1) later.
49 SectionId new_section_id = static_cast<SectionId>(metadata_list->size());
50 if (!IsSectionIdValid(new_section_id)) {
51 // Max number of sections reached
52 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
53 "Too many properties to be indexed, max number of properties "
54 "allowed: %d",
55 kMaxSectionId - kMinSectionId + 1));
56 }
57
58 // Creates section metadata
59 metadata_list->push_back(SectionMetadata(
60 new_section_id, property_config.data_type(),
61 property_config.string_indexing_config().tokenizer_type(),
62 property_config.string_indexing_config().term_match_type(),
63 property_config.integer_indexing_config().numeric_match_type(),
64 property_config.embedding_indexing_config().embedding_indexing_type(),
65 std::move(concatenated_path)));
66 return libtextclassifier3::Status::OK;
67 }
68
69 template <typename T>
AppendSection(SectionMetadata section_metadata,libtextclassifier3::StatusOr<std::vector<T>> && section_content_or,std::vector<Section<T>> & sections_out)70 void AppendSection(
71 SectionMetadata section_metadata,
72 libtextclassifier3::StatusOr<std::vector<T>>&& section_content_or,
73 std::vector<Section<T>>& sections_out) {
74 if (!section_content_or.ok()) {
75 return;
76 }
77
78 std::vector<T> section_content = std::move(section_content_or).ValueOrDie();
79 if (!section_content.empty()) {
80 // Adds to result vector if section is found in document
81 sections_out.emplace_back(std::move(section_metadata),
82 std::move(section_content));
83 }
84 }
85
86 } // namespace
87
88 libtextclassifier3::Status
ProcessSchemaTypePropertyConfig(SchemaTypeId schema_type_id,const PropertyConfigProto & property_config,std::string && property_path)89 SectionManager::Builder::ProcessSchemaTypePropertyConfig(
90 SchemaTypeId schema_type_id, const PropertyConfigProto& property_config,
91 std::string&& property_path) {
92 if (schema_type_id < 0 || schema_type_id >= section_metadata_cache_.size()) {
93 return absl_ports::InvalidArgumentError("Invalid schema type id");
94 }
95
96 // We don't need to check if the property is indexable. This method will
97 // only be called properties that should consume sectionIds, even if the
98 // property's indexing configuration itself is not indexable.
99 // This would be the case for unknown and non-indexable property paths that
100 // are defined in the indexable_nested_properties_list.
101 ICING_RETURN_IF_ERROR(
102 AppendNewSectionMetadata(§ion_metadata_cache_[schema_type_id],
103 std::move(property_path), property_config));
104 return libtextclassifier3::Status::OK;
105 }
106
107 libtextclassifier3::StatusOr<const SectionMetadata*>
GetSectionMetadata(SchemaTypeId schema_type_id,SectionId section_id) const108 SectionManager::GetSectionMetadata(SchemaTypeId schema_type_id,
109 SectionId section_id) const {
110 if (schema_type_id < 0 || schema_type_id >= section_metadata_cache_.size()) {
111 return absl_ports::InvalidArgumentError("Invalid schema type id");
112 }
113 if (!IsSectionIdValid(section_id)) {
114 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
115 "Section id %d is greater than the max value %d", section_id,
116 kMaxSectionId));
117 }
118
119 const std::vector<SectionMetadata>& section_metadatas =
120 section_metadata_cache_[schema_type_id];
121 if (section_id >= section_metadatas.size()) {
122 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
123 "Section with id %d doesn't exist in type config with id %d",
124 section_id, schema_type_id));
125 }
126
127 // The index of metadata list is the same as the section id, so we can use
128 // section id as the index.
129 return §ion_metadatas[section_id];
130 }
131
ExtractSections(const DocumentProto & document) const132 libtextclassifier3::StatusOr<SectionGroup> SectionManager::ExtractSections(
133 const DocumentProto& document) const {
134 ICING_ASSIGN_OR_RETURN(const std::vector<SectionMetadata>* metadata_list,
135 GetMetadataList(document.schema()));
136 SectionGroup section_group;
137 for (const SectionMetadata& section_metadata : *metadata_list) {
138 switch (section_metadata.data_type) {
139 case PropertyConfigProto::DataType::STRING: {
140 if (section_metadata.term_match_type == TermMatchType::UNKNOWN ||
141 section_metadata.tokenizer ==
142 StringIndexingConfig::TokenizerType::NONE) {
143 // Skip if term-match type is UNKNOWN, or if the tokenizer-type is
144 // NONE.
145 break;
146 }
147 AppendSection(
148 section_metadata,
149 property_util::ExtractPropertyValuesFromDocument<std::string_view>(
150 document, section_metadata.path),
151 section_group.string_sections);
152 break;
153 }
154 case PropertyConfigProto::DataType::INT64: {
155 if (section_metadata.numeric_match_type ==
156 IntegerIndexingConfig::NumericMatchType::UNKNOWN) {
157 // Skip if numeric-match type is UNKNOWN.
158 break;
159 }
160 AppendSection(section_metadata,
161 property_util::ExtractPropertyValuesFromDocument<int64_t>(
162 document, section_metadata.path),
163 section_group.integer_sections);
164 break;
165 }
166 case PropertyConfigProto::DataType::VECTOR: {
167 if (section_metadata.embedding_indexing_type ==
168 EmbeddingIndexingConfig::EmbeddingIndexingType::UNKNOWN) {
169 // Skip if embedding indexing type is UNKNOWN.
170 break;
171 }
172 AppendSection(
173 section_metadata,
174 property_util::ExtractPropertyValuesFromDocument<
175 PropertyProto::VectorProto>(document, section_metadata.path),
176 section_group.vector_sections);
177 break;
178 }
179 default: {
180 // Skip other data types.
181 break;
182 }
183 }
184 }
185 return section_group;
186 }
187
188 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
GetMetadataList(const std::string & type_config_name) const189 SectionManager::GetMetadataList(const std::string& type_config_name) const {
190 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
191 schema_type_mapper_.Get(type_config_name));
192 return §ion_metadata_cache_.at(schema_type_id);
193 }
194
195 } // namespace lib
196 } // namespace icing
197