1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/schema/section-manager.h"
16
17 #include <algorithm>
18 #include <cinttypes>
19 #include <cstddef>
20 #include <cstdint>
21 #include <iterator>
22 #include <memory>
23 #include <string>
24 #include <string_view>
25 #include <unordered_map>
26 #include <unordered_set>
27 #include <utility>
28 #include <vector>
29
30 #include "icing/text_classifier/lib3/utils/base/status.h"
31 #include "icing/text_classifier/lib3/utils/base/statusor.h"
32 #include "icing/absl_ports/canonical_errors.h"
33 #include "icing/absl_ports/str_cat.h"
34 #include "icing/legacy/core/icing-string-util.h"
35 #include "icing/proto/document.pb.h"
36 #include "icing/proto/schema.pb.h"
37 #include "icing/proto/term.pb.h"
38 #include "icing/schema/schema-util.h"
39 #include "icing/schema/section.h"
40 #include "icing/store/document-filter-data.h"
41 #include "icing/store/key-mapper.h"
42 #include "icing/util/status-macros.h"
43
44 namespace icing {
45 namespace lib {
46 namespace {
47
// Associates a string key (presumably a schema type name - TODO confirm) with
// an immutable list of SectionMetadata.
// NOTE(review): no use of this alias is visible in this file; confirm whether
// it is still needed.
using TypeSectionMap =
    std::unordered_map<std::string, const std::vector<SectionMetadata>>;
50
51 // Helper function to concatenate a path and a property name
ConcatenatePath(const std::string & path,const std::string & next_property_name)52 std::string ConcatenatePath(const std::string& path,
53 const std::string& next_property_name) {
54 if (path.empty()) {
55 return next_property_name;
56 }
57 return absl_ports::StrCat(path, kPropertySeparator, next_property_name);
58 }
59
AssignSections(const SchemaTypeConfigProto & current_type_config,const std::string & current_section_path,const SchemaUtil::TypeConfigMap & type_config_map,std::vector<SectionMetadata> * metadata_list)60 libtextclassifier3::Status AssignSections(
61 const SchemaTypeConfigProto& current_type_config,
62 const std::string& current_section_path,
63 const SchemaUtil::TypeConfigMap& type_config_map,
64 std::vector<SectionMetadata>* metadata_list) {
65 // Sorts properties by name's alphabetical order so that order doesn't affect
66 // section assigning.
67 auto sorted_properties = current_type_config.properties();
68 std::sort(sorted_properties.pointer_begin(), sorted_properties.pointer_end(),
69 [](const PropertyConfigProto* p1, const PropertyConfigProto* p2) {
70 return p1->property_name() < p2->property_name();
71 });
72 for (const auto& property_config : sorted_properties) {
73 if (property_config.data_type() ==
74 PropertyConfigProto::DataType::DOCUMENT) {
75 auto nested_type_config_iter =
76 type_config_map.find(property_config.schema_type());
77 if (nested_type_config_iter == type_config_map.end()) {
78 // This should never happen because our schema should already be
79 // validated by this point.
80 return absl_ports::NotFoundError(absl_ports::StrCat(
81 "Type config not found: ", property_config.schema_type()));
82 }
83
84 if (property_config.document_indexing_config()
85 .index_nested_properties()) {
86 // Assign any indexed sections recursively
87 const SchemaTypeConfigProto& nested_type_config =
88 nested_type_config_iter->second;
89 ICING_RETURN_IF_ERROR(
90 AssignSections(nested_type_config,
91 ConcatenatePath(current_section_path,
92 property_config.property_name()),
93 type_config_map, metadata_list));
94 }
95 }
96
97 // Only index strings currently.
98 if (property_config.has_data_type() !=
99 PropertyConfigProto::DataType::STRING ||
100 property_config.string_indexing_config().term_match_type() ==
101 TermMatchType::UNKNOWN) {
102 // No need to create section for current property
103 continue;
104 }
105
106 // Creates section metadata according to data type
107 // Validates next section id, makes sure that section id is the same as
108 // the list index so that we could find any section metadata by id in O(1)
109 // later.
110 auto new_section_id = static_cast<SectionId>(metadata_list->size());
111 if (!IsSectionIdValid(new_section_id)) {
112 // Max number of sections reached
113 return absl_ports::OutOfRangeError(IcingStringUtil::StringPrintf(
114 "Too many properties to be indexed, max number of properties "
115 "allowed: %d",
116 kMaxSectionId - kMinSectionId + 1));
117 }
118
119 // Creates section metadata from property config
120 metadata_list->emplace_back(
121 new_section_id,
122 property_config.string_indexing_config().term_match_type(),
123 property_config.string_indexing_config().tokenizer_type(),
124 ConcatenatePath(current_section_path, property_config.property_name()));
125 }
126 return libtextclassifier3::Status::OK;
127 }
128
129 // Builds a vector of vectors that holds SectionMetadatas for all the schema
130 // types. The outer vector's index corresponds with a type's SchemaTypeId. The
131 // inner vector's index corresponds to the section's SectionId.
132 libtextclassifier3::StatusOr<std::vector<std::vector<SectionMetadata>>>
BuildSectionMetadataCache(const SchemaUtil::TypeConfigMap & type_config_map,const KeyMapper<SchemaTypeId> & schema_type_mapper)133 BuildSectionMetadataCache(const SchemaUtil::TypeConfigMap& type_config_map,
134 const KeyMapper<SchemaTypeId>& schema_type_mapper) {
135 // Create our vector and reserve the number of schema types we have
136 std::vector<std::vector<SectionMetadata>> section_metadata_cache(
137 schema_type_mapper.num_keys());
138
139 for (const auto& name_and_type : type_config_map) {
140 // Assigns sections for each type config
141 const std::string& type_config_name = name_and_type.first;
142 const SchemaTypeConfigProto& type_config = name_and_type.second;
143 std::vector<SectionMetadata> metadata_list;
144 ICING_RETURN_IF_ERROR(AssignSections(type_config,
145 /*current_section_path*/ "",
146 type_config_map, &metadata_list));
147
148 // Insert the section metadata list at the index of the type's SchemaTypeId
149 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
150 schema_type_mapper.Get(type_config_name));
151 section_metadata_cache[schema_type_id] = std::move(metadata_list);
152 }
153 return section_metadata_cache;
154 }
155
156 // Helper function to get string content from a property. Repeated values are
157 // joined into one string. We only care about the STRING data type.
GetStringPropertyContent(const PropertyProto & property)158 std::vector<std::string_view> GetStringPropertyContent(
159 const PropertyProto& property) {
160 std::vector<std::string_view> values;
161 if (!property.string_values().empty()) {
162 std::copy(property.string_values().begin(), property.string_values().end(),
163 std::back_inserter(values));
164 }
165 return values;
166 }
167
168 } // namespace
169
SectionManager(const KeyMapper<SchemaTypeId> * schema_type_mapper,std::vector<std::vector<SectionMetadata>> && section_metadata_cache)170 SectionManager::SectionManager(
171 const KeyMapper<SchemaTypeId>* schema_type_mapper,
172 std::vector<std::vector<SectionMetadata>>&& section_metadata_cache)
173 : schema_type_mapper_(*schema_type_mapper),
174 section_metadata_cache_(std::move(section_metadata_cache)) {}
175
176 libtextclassifier3::StatusOr<std::unique_ptr<SectionManager>>
Create(const SchemaUtil::TypeConfigMap & type_config_map,const KeyMapper<SchemaTypeId> * schema_type_mapper)177 SectionManager::Create(const SchemaUtil::TypeConfigMap& type_config_map,
178 const KeyMapper<SchemaTypeId>* schema_type_mapper) {
179 ICING_RETURN_ERROR_IF_NULL(schema_type_mapper);
180
181 ICING_ASSIGN_OR_RETURN(
182 std::vector<std::vector<SectionMetadata>> section_metadata_cache,
183 BuildSectionMetadataCache(type_config_map, *schema_type_mapper));
184 return std::unique_ptr<SectionManager>(new SectionManager(
185 schema_type_mapper, std::move(section_metadata_cache)));
186 }
187
188 libtextclassifier3::StatusOr<std::vector<std::string_view>>
GetStringSectionContent(const DocumentProto & document,std::string_view section_path) const189 SectionManager::GetStringSectionContent(const DocumentProto& document,
190 std::string_view section_path) const {
191 // Finds the first property name in section_path
192 size_t separator_position = section_path.find(kPropertySeparator);
193 std::string_view current_property_name =
194 (separator_position == std::string::npos)
195 ? section_path
196 : section_path.substr(0, separator_position);
197
198 // Tries to match the property name with the ones in document
199 auto property_iterator =
200 std::find_if(document.properties().begin(), document.properties().end(),
201 [current_property_name](const PropertyProto& property) {
202 return property.name() == current_property_name;
203 });
204
205 if (property_iterator == document.properties().end()) {
206 // Property name not found, it could be one of the following 2 cases:
207 // 1. The property is optional and it's not in the document
208 // 2. The property name is invalid
209 return absl_ports::NotFoundError(absl_ports::StrCat(
210 "Section path '", section_path, "' not found in document."));
211 }
212
213 if (separator_position == std::string::npos) {
214 // Current property name is the last one in section path
215 std::vector<std::string_view> content =
216 GetStringPropertyContent(*property_iterator);
217 if (content.empty()) {
218 // The content of property is explicitly set to empty, we'll treat it as
219 // NOT_FOUND because the index doesn't care about empty strings.
220 return absl_ports::NotFoundError(absl_ports::StrCat(
221 "Section path '", section_path, "' content was empty"));
222 }
223 return content;
224 }
225
226 // Gets section content recursively
227 std::string_view sub_section_path =
228 section_path.substr(separator_position + 1);
229 std::vector<std::string_view> nested_document_content;
230 for (const auto& nested_document : property_iterator->document_values()) {
231 auto content_or =
232 GetStringSectionContent(nested_document, sub_section_path);
233 if (content_or.ok()) {
234 std::vector<std::string_view> content =
235 std::move(content_or).ValueOrDie();
236 std::move(content.begin(), content.end(),
237 std::back_inserter(nested_document_content));
238 }
239 }
240 if (nested_document_content.empty()) {
241 return absl_ports::NotFoundError(
242 absl_ports::StrCat("Section path ", section_path,
243 " not found in type config ", document.schema()));
244 }
245 return nested_document_content;
246 }
247
248 libtextclassifier3::StatusOr<std::vector<std::string_view>>
GetStringSectionContent(const DocumentProto & document,SectionId section_id) const249 SectionManager::GetStringSectionContent(const DocumentProto& document,
250 SectionId section_id) const {
251 if (!IsSectionIdValid(section_id)) {
252 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
253 "Section id %d is greater than the max value %d", section_id,
254 kMaxSectionId));
255 }
256 ICING_ASSIGN_OR_RETURN(const std::vector<SectionMetadata>* metadata_list,
257 GetMetadataList(document.schema()));
258 if (section_id >= metadata_list->size()) {
259 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
260 "Section with id %d doesn't exist in type config %s", section_id,
261 document.schema().c_str()));
262 }
263 // The index of metadata list is the same as the section id, so we can use
264 // section id as the index.
265 return GetStringSectionContent(document, metadata_list->at(section_id).path);
266 }
267
268 libtextclassifier3::StatusOr<const SectionMetadata*>
GetSectionMetadata(SchemaTypeId schema_type_id,SectionId section_id) const269 SectionManager::GetSectionMetadata(SchemaTypeId schema_type_id,
270 SectionId section_id) const {
271 if (!IsSectionIdValid(section_id)) {
272 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
273 "Section id %d is greater than the max value %d", section_id,
274 kMaxSectionId));
275 }
276 const std::vector<SectionMetadata>& section_metadatas =
277 section_metadata_cache_[schema_type_id];
278 if (section_id >= section_metadatas.size()) {
279 return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf(
280 "Section with id %d doesn't exist in type config with id %d",
281 section_id, schema_type_id));
282 }
283
284 // The index of metadata list is the same as the section id, so we can use
285 // section id as the index.
286 return §ion_metadatas[section_id];
287 }
288
289 libtextclassifier3::StatusOr<std::vector<Section>>
ExtractSections(const DocumentProto & document) const290 SectionManager::ExtractSections(const DocumentProto& document) const {
291 ICING_ASSIGN_OR_RETURN(const std::vector<SectionMetadata>* metadata_list,
292 GetMetadataList(document.schema()));
293 std::vector<Section> sections;
294 for (const auto& section_metadata : *metadata_list) {
295 auto section_content_or =
296 GetStringSectionContent(document, section_metadata.path);
297 // Adds to result vector if section is found in document
298 if (section_content_or.ok()) {
299 sections.emplace_back(SectionMetadata(section_metadata),
300 std::move(section_content_or).ValueOrDie());
301 }
302 }
303 return sections;
304 }
305
306 libtextclassifier3::StatusOr<const std::vector<SectionMetadata>*>
GetMetadataList(const std::string & type_config_name) const307 SectionManager::GetMetadataList(const std::string& type_config_name) const {
308 ICING_ASSIGN_OR_RETURN(SchemaTypeId schema_type_id,
309 schema_type_mapper_.Get(type_config_name));
310 return §ion_metadata_cache_.at(schema_type_id);
311 }
312
313 } // namespace lib
314 } // namespace icing
315