• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2020 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_STORE_TOKENIZED_DOCUMENT_H_
16 #define ICING_STORE_TOKENIZED_DOCUMENT_H_
17 
#include <cstdint>
#include <string>
#include <string_view>
#include <vector>

#include "icing/proto/document.pb.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/tokenization/language-segmenter.h"
27 
28 namespace icing {
29 namespace lib {
30 
31 struct TokenizedSection {
32   SectionMetadata metadata;
33   std::vector<std::string_view> token_sequence;
34 
TokenizedSectionTokenizedSection35   TokenizedSection(SectionMetadata&& metadata_in,
36                    std::vector<std::string_view>&& token_sequence_in)
37       : metadata(std::move(metadata_in)),
38         token_sequence(std::move(token_sequence_in)) {}
39 };
40 
41 class TokenizedDocument {
42  public:
43   static libtextclassifier3::StatusOr<TokenizedDocument> Create(
44       const SchemaStore* schema_store,
45       const LanguageSegmenter* language_segmenter, DocumentProto document);
46 
document()47   const DocumentProto& document() const { return document_; }
48 
num_tokens()49   int32_t num_tokens() const {
50     int32_t num_tokens = 0;
51     for (const TokenizedSection& section : tokenized_sections_) {
52       num_tokens += section.token_sequence.size();
53     }
54     return num_tokens;
55   }
56 
sections()57   const std::vector<TokenizedSection>& sections() const {
58     return tokenized_sections_;
59   }
60 
61  private:
62   // Use TokenizedDocument::Create() to instantiate.
63   explicit TokenizedDocument(DocumentProto document);
64 
65   DocumentProto document_;
66   std::vector<TokenizedSection> tokenized_sections_;
67 
68   libtextclassifier3::Status Tokenize(
69       const SchemaStore* schema_store,
70       const LanguageSegmenter* language_segmenter);
71 };
72 
73 }  // namespace lib
74 }  // namespace icing
75 
76 #endif  // ICING_STORE_TOKENIZED_DOCUMENT_H_
77