// Copyright (C) 2020 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "icing/util/tokenized-document.h"

#include <memory>
#include <string_view>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/proto/document.pb.h"
#include "icing/schema/joinable-property.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/tokenization/language-segmenter.h"
#include "icing/tokenization/token.h"
#include "icing/tokenization/tokenizer-factory.h"
#include "icing/tokenization/tokenizer.h"
#include "icing/util/document-validator.h"
#include "icing/util/status-macros.h"

namespace icing {
namespace lib {

namespace {

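// Tokenizes each string section with the tokenizer type named in the
// section's metadata, flattening the token texts of all subcontents into a
// single ordered token sequence per section.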
libtextclassifier3::StatusOr<std::vector<TokenizedSection>> Tokenize(
    const SchemaStore* schema_store,
    const LanguageSegmenter* language_segmenter,
    const std::vector<Section<std::string_view>>& string_sections) {
  std::vector<TokenizedSection> tokenized_string_sections;
  for (const Section<std::string_view>& section : string_sections) {
    ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> tokenizer,
                           tokenizer_factory::CreateIndexingTokenizer(
                               section.metadata.tokenizer, language_segmenter));
    std::vector<std::string_view> token_sequence;
    for (std::string_view subcontent : section.content) {
      ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> itr,
                             tokenizer->Tokenize(subcontent));
      while (itr->Advance()) {
        std::vector<Token> batch_tokens = itr->GetTokens();
        for (const Token& token : batch_tokens) {
          token_sequence.push_back(token.text);
        }
      }
    }
    tokenized_string_sections.emplace_back(SectionMetadata(section.metadata),
                                           std::move(token_sequence));
  }

  return tokenized_string_sections;
}

}  // namespace

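// Validates `document` against the schema, extracts its sections and joinable
// properties, tokenizes the string sections, and assembles the result into a
// TokenizedDocument.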
/* static */ libtextclassifier3::StatusOr<TokenizedDocument>
TokenizedDocument::Create(const SchemaStore* schema_store,
                          const LanguageSegmenter* language_segmenter,
                          DocumentProto document) {
  DocumentValidator validator(schema_store);
  ICING_RETURN_IF_ERROR(validator.Validate(document));

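  // Extract all indexable sections (string, integer, vector) declared by the
  // schema for this document's type.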
  ICING_ASSIGN_OR_RETURN(SectionGroup section_group,
                         schema_store->ExtractSections(document));

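  // Extract the document's joinable properties.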
  ICING_ASSIGN_OR_RETURN(JoinablePropertyGroup joinable_property_group,
                         schema_store->ExtractJoinableProperties(document));

  // Tokenize string sections
  ICING_ASSIGN_OR_RETURN(
      std::vector<TokenizedSection> tokenized_string_sections,
      Tokenize(schema_store, language_segmenter,
               section_group.string_sections));

  return TokenizedDocument(std::move(document),
                           std::move(tokenized_string_sections),
                           std::move(section_group.integer_sections),
                           std::move(section_group.vector_sections),
                           std::move(joinable_property_group));
}

}  // namespace lib
}  // namespace icing