1 // Copyright (C) 2020 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/util/tokenized-document.h"
16
17 #include <memory>
18 #include <string_view>
19 #include <utility>
20 #include <vector>
21
22 #include "icing/text_classifier/lib3/utils/base/statusor.h"
23 #include "icing/proto/document.pb.h"
24 #include "icing/schema/joinable-property.h"
25 #include "icing/schema/schema-store.h"
26 #include "icing/schema/section.h"
27 #include "icing/tokenization/language-segmenter.h"
28 #include "icing/tokenization/token.h"
29 #include "icing/tokenization/tokenizer-factory.h"
30 #include "icing/tokenization/tokenizer.h"
31 #include "icing/util/document-validator.h"
32 #include "icing/util/status-macros.h"
33
34 namespace icing {
35 namespace lib {
36
37 namespace {
38
Tokenize(const SchemaStore * schema_store,const LanguageSegmenter * language_segmenter,const std::vector<Section<std::string_view>> & string_sections)39 libtextclassifier3::StatusOr<std::vector<TokenizedSection>> Tokenize(
40 const SchemaStore* schema_store,
41 const LanguageSegmenter* language_segmenter,
42 const std::vector<Section<std::string_view>>& string_sections) {
43 std::vector<TokenizedSection> tokenized_string_sections;
44 for (const Section<std::string_view>& section : string_sections) {
45 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> tokenizer,
46 tokenizer_factory::CreateIndexingTokenizer(
47 section.metadata.tokenizer, language_segmenter));
48 std::vector<std::string_view> token_sequence;
49 for (std::string_view subcontent : section.content) {
50 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> itr,
51 tokenizer->Tokenize(subcontent));
52 while (itr->Advance()) {
53 std::vector<Token> batch_tokens = itr->GetTokens();
54 for (const Token& token : batch_tokens) {
55 token_sequence.push_back(token.text);
56 }
57 }
58 }
59 tokenized_string_sections.emplace_back(SectionMetadata(section.metadata),
60 std::move(token_sequence));
61 }
62
63 return tokenized_string_sections;
64 }
65
66 } // namespace
67
68 /* static */ libtextclassifier3::StatusOr<TokenizedDocument>
Create(const SchemaStore * schema_store,const LanguageSegmenter * language_segmenter,DocumentProto document)69 TokenizedDocument::Create(const SchemaStore* schema_store,
70 const LanguageSegmenter* language_segmenter,
71 DocumentProto document) {
72 // Since there are many std::string_view objects pointing to the document
73 // proto, we should make sure DocumentProto has a fixed address. The simplest
74 // way is to use a unique_ptr.
75 auto document_ptr = std::make_unique<DocumentProto>(std::move(document));
76
77 DocumentValidator validator(schema_store);
78 ICING_RETURN_IF_ERROR(validator.Validate(*document_ptr));
79
80 ICING_ASSIGN_OR_RETURN(SectionGroup section_group,
81 schema_store->ExtractSections(*document_ptr));
82
83 ICING_ASSIGN_OR_RETURN(
84 JoinablePropertyGroup joinable_property_group,
85 schema_store->ExtractJoinableProperties(*document_ptr));
86
87 // Tokenize string sections
88 ICING_ASSIGN_OR_RETURN(
89 std::vector<TokenizedSection> tokenized_string_sections,
90 Tokenize(schema_store, language_segmenter,
91 section_group.string_sections));
92
93 return TokenizedDocument(std::move(document_ptr),
94 std::move(tokenized_string_sections),
95 std::move(section_group.integer_sections),
96 std::move(section_group.vector_sections),
97 std::move(joinable_property_group));
98 }
99
100 } // namespace lib
101 } // namespace icing
102