1 // Copyright (C) 2020 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #ifndef ICING_STORE_TOKENIZED_DOCUMENT_H_ 16 #define ICING_STORE_TOKENIZED_DOCUMENT_H_ 17 18 #include <cstdint> 19 #include <string> 20 #include <vector> 21 22 #include "icing/text_classifier/lib3/utils/base/statusor.h" 23 #include "icing/proto/document.pb.h" 24 #include "icing/schema/schema-store.h" 25 #include "icing/schema/section.h" 26 #include "icing/tokenization/language-segmenter.h" 27 28 namespace icing { 29 namespace lib { 30 31 struct TokenizedSection { 32 SectionMetadata metadata; 33 std::vector<std::string_view> token_sequence; 34 TokenizedSectionTokenizedSection35 TokenizedSection(SectionMetadata&& metadata_in, 36 std::vector<std::string_view>&& token_sequence_in) 37 : metadata(std::move(metadata_in)), 38 token_sequence(std::move(token_sequence_in)) {} 39 }; 40 41 class TokenizedDocument { 42 public: 43 static libtextclassifier3::StatusOr<TokenizedDocument> Create( 44 const SchemaStore* schema_store, 45 const LanguageSegmenter* language_segmenter, DocumentProto document); 46 document()47 const DocumentProto& document() const { return document_; } 48 num_tokens()49 int32_t num_tokens() const { 50 int32_t num_tokens = 0; 51 for (const TokenizedSection& section : tokenized_sections_) { 52 num_tokens += section.token_sequence.size(); 53 } 54 return num_tokens; 55 } 56 sections()57 const std::vector<TokenizedSection>& sections() const { 58 return tokenized_sections_; 59 } 60 61 private: 62 // Use TokenizedDocument::Create() to instantiate. 63 explicit TokenizedDocument(DocumentProto document); 64 65 DocumentProto document_; 66 std::vector<TokenizedSection> tokenized_sections_; 67 68 libtextclassifier3::Status Tokenize( 69 const SchemaStore* schema_store, 70 const LanguageSegmenter* language_segmenter); 71 }; 72 73 } // namespace lib 74 } // namespace icing 75 76 #endif // ICING_STORE_TOKENIZED_DOCUMENT_H_ 77