1 // Copyright (C) 2020 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_STORE_TOKENIZED_DOCUMENT_H_ 16 #define ICING_STORE_TOKENIZED_DOCUMENT_H_ 17 18 #include <cstdint> 19 #include <string> 20 #include <vector> 21 22 #include "icing/text_classifier/lib3/utils/base/statusor.h" 23 #include "icing/proto/document.pb.h" 24 #include "icing/schema/joinable-property.h" 25 #include "icing/schema/schema-store.h" 26 #include "icing/schema/section.h" 27 #include "icing/tokenization/language-segmenter.h" 28 29 namespace icing { 30 namespace lib { 31 32 struct TokenizedSection { 33 SectionMetadata metadata; 34 std::vector<std::string_view> token_sequence; 35 TokenizedSectionTokenizedSection36 TokenizedSection(SectionMetadata&& metadata_in, 37 std::vector<std::string_view>&& token_sequence_in) 38 : metadata(std::move(metadata_in)), 39 token_sequence(std::move(token_sequence_in)) {} 40 }; 41 42 class TokenizedDocument { 43 public: 44 static libtextclassifier3::StatusOr<TokenizedDocument> Create( 45 const SchemaStore* schema_store, 46 const LanguageSegmenter* language_segmenter, DocumentProto document); 47 document()48 const DocumentProto& document() const { return document_; } 49 num_string_tokens()50 int32_t num_string_tokens() const { 51 int32_t num_string_tokens = 0; 52 for (const TokenizedSection& section : tokenized_string_sections_) { 53 num_string_tokens += section.token_sequence.size(); 54 } 55 return num_string_tokens; 56 } 57 tokenized_string_sections()58 const std::vector<TokenizedSection>& tokenized_string_sections() const { 59 return tokenized_string_sections_; 60 } 61 integer_sections()62 const std::vector<Section<int64_t>>& integer_sections() const { 63 return integer_sections_; 64 } 65 66 const std::vector<JoinableProperty<std::string_view>>& qualified_id_join_properties()67 qualified_id_join_properties() const { 68 return joinable_property_group_.qualified_id_properties; 69 } 70 71 private: 72 // Use TokenizedDocument::Create() to instantiate. TokenizedDocument(DocumentProto && document,std::vector<TokenizedSection> && tokenized_string_sections,std::vector<Section<int64_t>> && integer_sections,JoinablePropertyGroup && joinable_property_group)73 explicit TokenizedDocument( 74 DocumentProto&& document, 75 std::vector<TokenizedSection>&& tokenized_string_sections, 76 std::vector<Section<int64_t>>&& integer_sections, 77 JoinablePropertyGroup&& joinable_property_group) 78 : document_(std::move(document)), 79 tokenized_string_sections_(std::move(tokenized_string_sections)), 80 integer_sections_(std::move(integer_sections)), 81 joinable_property_group_(std::move(joinable_property_group)) {} 82 83 DocumentProto document_; 84 std::vector<TokenizedSection> tokenized_string_sections_; 85 std::vector<Section<int64_t>> integer_sections_; 86 JoinablePropertyGroup joinable_property_group_; 87 }; 88 89 } // namespace lib 90 } // namespace icing 91 92 #endif // ICING_STORE_TOKENIZED_DOCUMENT_H_ 93