1 // Copyright (C) 2020 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/util/tokenized-document.h"
16
17 #include <string>
18 #include <string_view>
19 #include <vector>
20
21 #include "icing/text_classifier/lib3/utils/base/status.h"
22 #include "icing/proto/document.pb.h"
23 #include "icing/schema/schema-store.h"
24 #include "icing/schema/section.h"
25 #include "icing/tokenization/language-segmenter.h"
26 #include "icing/tokenization/tokenizer-factory.h"
27 #include "icing/tokenization/tokenizer.h"
28 #include "icing/util/document-validator.h"
29 #include "icing/util/status-macros.h"
30
31 namespace icing {
32 namespace lib {
33
Create(const SchemaStore * schema_store,const LanguageSegmenter * language_segmenter,DocumentProto document)34 libtextclassifier3::StatusOr<TokenizedDocument> TokenizedDocument::Create(
35 const SchemaStore* schema_store,
36 const LanguageSegmenter* language_segmenter, DocumentProto document) {
37 TokenizedDocument tokenized_document(std::move(document));
38 ICING_RETURN_IF_ERROR(
39 tokenized_document.Tokenize(schema_store, language_segmenter));
40 return tokenized_document;
41 }
42
TokenizedDocument(DocumentProto document)43 TokenizedDocument::TokenizedDocument(DocumentProto document)
44 : document_(std::move(document)) {}
45
Tokenize(const SchemaStore * schema_store,const LanguageSegmenter * language_segmenter)46 libtextclassifier3::Status TokenizedDocument::Tokenize(
47 const SchemaStore* schema_store,
48 const LanguageSegmenter* language_segmenter) {
49 DocumentValidator validator(schema_store);
50 ICING_RETURN_IF_ERROR(validator.Validate(document_));
51
52 ICING_ASSIGN_OR_RETURN(std::vector<Section> sections,
53 schema_store->ExtractSections(document_));
54 for (const Section& section : sections) {
55 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> tokenizer,
56 tokenizer_factory::CreateIndexingTokenizer(
57 section.metadata.tokenizer, language_segmenter));
58 std::vector<std::string_view> token_sequence;
59 for (std::string_view subcontent : section.content) {
60 ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> itr,
61 tokenizer->Tokenize(subcontent));
62 while (itr->Advance()) {
63 token_sequence.push_back(itr->GetToken().text);
64 }
65 }
66 tokenized_sections_.emplace_back(SectionMetadata(section.metadata),
67 std::move(token_sequence));
68 }
69
70 return libtextclassifier3::Status::OK;
71 }
72
73 } // namespace lib
74 } // namespace icing
75