• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2020 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/util/tokenized-document.h"
16 
17 #include <string>
18 #include <string_view>
19 #include <vector>
20 
21 #include "icing/text_classifier/lib3/utils/base/status.h"
22 #include "icing/proto/document.pb.h"
23 #include "icing/schema/schema-store.h"
24 #include "icing/schema/section.h"
25 #include "icing/tokenization/language-segmenter.h"
26 #include "icing/tokenization/tokenizer-factory.h"
27 #include "icing/tokenization/tokenizer.h"
28 #include "icing/util/document-validator.h"
29 #include "icing/util/status-macros.h"
30 
31 namespace icing {
32 namespace lib {
33 
Create(const SchemaStore * schema_store,const LanguageSegmenter * language_segmenter,DocumentProto document)34 libtextclassifier3::StatusOr<TokenizedDocument> TokenizedDocument::Create(
35     const SchemaStore* schema_store,
36     const LanguageSegmenter* language_segmenter, DocumentProto document) {
37   TokenizedDocument tokenized_document(std::move(document));
38   ICING_RETURN_IF_ERROR(
39       tokenized_document.Tokenize(schema_store, language_segmenter));
40   return tokenized_document;
41 }
42 
TokenizedDocument(DocumentProto document)43 TokenizedDocument::TokenizedDocument(DocumentProto document)
44     : document_(std::move(document)) {}
45 
Tokenize(const SchemaStore * schema_store,const LanguageSegmenter * language_segmenter)46 libtextclassifier3::Status TokenizedDocument::Tokenize(
47     const SchemaStore* schema_store,
48     const LanguageSegmenter* language_segmenter) {
49   DocumentValidator validator(schema_store);
50   ICING_RETURN_IF_ERROR(validator.Validate(document_));
51 
52   ICING_ASSIGN_OR_RETURN(std::vector<Section> sections,
53                          schema_store->ExtractSections(document_));
54   for (const Section& section : sections) {
55     ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer> tokenizer,
56                            tokenizer_factory::CreateIndexingTokenizer(
57                                section.metadata.tokenizer, language_segmenter));
58     std::vector<std::string_view> token_sequence;
59     for (std::string_view subcontent : section.content) {
60       ICING_ASSIGN_OR_RETURN(std::unique_ptr<Tokenizer::Iterator> itr,
61                              tokenizer->Tokenize(subcontent));
62       while (itr->Advance()) {
63         token_sequence.push_back(itr->GetToken().text);
64       }
65     }
66     tokenized_sections_.emplace_back(SectionMetadata(section.metadata),
67                                      std::move(token_sequence));
68   }
69 
70   return libtextclassifier3::Status::OK;
71 }
72 
73 }  // namespace lib
74 }  // namespace icing
75