• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2020 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef ICING_STORE_TOKENIZED_DOCUMENT_H_
16 #define ICING_STORE_TOKENIZED_DOCUMENT_H_
17 
#include <cstddef>
#include <cstdint>
#include <string>
#include <string_view>
#include <utility>
#include <vector>

#include "icing/text_classifier/lib3/utils/base/statusor.h"
#include "icing/proto/document.pb.h"
#include "icing/schema/joinable-property.h"
#include "icing/schema/schema-store.h"
#include "icing/schema/section.h"
#include "icing/tokenization/language-segmenter.h"
28 
29 namespace icing {
30 namespace lib {
31 
32 struct TokenizedSection {
33   SectionMetadata metadata;
34   std::vector<std::string_view> token_sequence;
35 
TokenizedSectionTokenizedSection36   TokenizedSection(SectionMetadata&& metadata_in,
37                    std::vector<std::string_view>&& token_sequence_in)
38       : metadata(std::move(metadata_in)),
39         token_sequence(std::move(token_sequence_in)) {}
40 };
41 
42 class TokenizedDocument {
43  public:
44   static libtextclassifier3::StatusOr<TokenizedDocument> Create(
45       const SchemaStore* schema_store,
46       const LanguageSegmenter* language_segmenter, DocumentProto document);
47 
document()48   const DocumentProto& document() const { return document_; }
49 
num_string_tokens()50   int32_t num_string_tokens() const {
51     int32_t num_string_tokens = 0;
52     for (const TokenizedSection& section : tokenized_string_sections_) {
53       num_string_tokens += section.token_sequence.size();
54     }
55     return num_string_tokens;
56   }
57 
tokenized_string_sections()58   const std::vector<TokenizedSection>& tokenized_string_sections() const {
59     return tokenized_string_sections_;
60   }
61 
integer_sections()62   const std::vector<Section<int64_t>>& integer_sections() const {
63     return integer_sections_;
64   }
65 
66   const std::vector<JoinableProperty<std::string_view>>&
qualified_id_join_properties()67   qualified_id_join_properties() const {
68     return joinable_property_group_.qualified_id_properties;
69   }
70 
71  private:
72   // Use TokenizedDocument::Create() to instantiate.
TokenizedDocument(DocumentProto && document,std::vector<TokenizedSection> && tokenized_string_sections,std::vector<Section<int64_t>> && integer_sections,JoinablePropertyGroup && joinable_property_group)73   explicit TokenizedDocument(
74       DocumentProto&& document,
75       std::vector<TokenizedSection>&& tokenized_string_sections,
76       std::vector<Section<int64_t>>&& integer_sections,
77       JoinablePropertyGroup&& joinable_property_group)
78       : document_(std::move(document)),
79         tokenized_string_sections_(std::move(tokenized_string_sections)),
80         integer_sections_(std::move(integer_sections)),
81         joinable_property_group_(std::move(joinable_property_group)) {}
82 
83   DocumentProto document_;
84   std::vector<TokenizedSection> tokenized_string_sections_;
85   std::vector<Section<int64_t>> integer_sections_;
86   JoinablePropertyGroup joinable_property_group_;
87 };
88 
89 }  // namespace lib
90 }  // namespace icing
91 
92 #endif  // ICING_STORE_TOKENIZED_DOCUMENT_H_
93