• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2020 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/util/tokenized-document.h"
16 
17 #include <memory>
18 #include <string_view>
19 #include <utility>
20 #include <vector>
21 
22 #include "icing/text_classifier/lib3/utils/base/statusor.h"
23 #include "icing/proto/document.pb.h"
24 #include "icing/schema/joinable-property.h"
25 #include "icing/schema/schema-store.h"
26 #include "icing/schema/section.h"
27 #include "icing/tokenization/language-segmenter.h"
28 #include "icing/tokenization/token.h"
29 #include "icing/tokenization/tokenizer-factory.h"
30 #include "icing/tokenization/tokenizer.h"
31 #include "icing/util/document-validator.h"
32 #include "icing/util/status-macros.h"
33 
34 namespace icing {
35 namespace lib {
36 
37 namespace {
38 
Tokenize(const SchemaStore * schema_store,const LanguageSegmenter * language_segmenter,const std::vector<Section<std::string_view>> & string_sections)39 libtextclassifier3::StatusOr<std::vector<TokenizedSection>> Tokenize(
40     const SchemaStore* schema_store,
41     const LanguageSegmenter* language_segmenter,
42     const std::vector<Section<std::string_view>>& string_sections) {
43   std::vector<TokenizedSection> tokenized_string_sections;
44   for (const Section<std::string_view>& section : string_sections) {
45     ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> tokenizer,
46                            tokenizer_factory::CreateIndexingTokenizer(
47                                section.metadata.tokenizer, language_segmenter));
48     std::vector<std::string_view> token_sequence;
49     for (std::string_view subcontent : section.content) {
50       ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> itr,
51                              tokenizer->Tokenize(subcontent));
52       while (itr->Advance()) {
53         std::vector<Token> batch_tokens = itr->GetTokens();
54         for (const Token& token : batch_tokens) {
55           token_sequence.push_back(token.text);
56         }
57       }
58     }
59     tokenized_string_sections.emplace_back(SectionMetadata(section.metadata),
60                                            std::move(token_sequence));
61   }
62 
63   return tokenized_string_sections;
64 }
65 
66 }  // namespace
67 
68 /* static */ libtextclassifier3::StatusOr<TokenizedDocument>
Create(const SchemaStore * schema_store,const LanguageSegmenter * language_segmenter,DocumentProto document)69 TokenizedDocument::Create(const SchemaStore* schema_store,
70                           const LanguageSegmenter* language_segmenter,
71                           DocumentProto document) {
72   // Since there are many std::string_view objects pointing to the document
73   // proto, we should make sure DocumentProto has a fixed address. The simplest
74   // way is to use a unique_ptr.
75   auto document_ptr = std::make_unique<DocumentProto>(std::move(document));
76 
77   DocumentValidator validator(schema_store);
78   ICING_RETURN_IF_ERROR(validator.Validate(*document_ptr));
79 
80   ICING_ASSIGN_OR_RETURN(SectionGroup section_group,
81                          schema_store->ExtractSections(*document_ptr));
82 
83   ICING_ASSIGN_OR_RETURN(
84       JoinablePropertyGroup joinable_property_group,
85       schema_store->ExtractJoinableProperties(*document_ptr));
86 
87   // Tokenize string sections
88   ICING_ASSIGN_OR_RETURN(
89       std::vector<TokenizedSection> tokenized_string_sections,
90       Tokenize(schema_store, language_segmenter,
91                section_group.string_sections));
92 
93   return TokenizedDocument(std::move(document_ptr),
94                            std::move(tokenized_string_sections),
95                            std::move(section_group.integer_sections),
96                            std::move(section_group.vector_sections),
97                            std::move(joinable_property_group));
98 }
99 
100 }  // namespace lib
101 }  // namespace icing
102