1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_ 18 #define LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_ 19 20 #include <string> 21 #include <vector> 22 23 #include "absl/strings/string_view.h" 24 25 namespace libtextclassifier3 { 26 27 struct LookupStatus { LookupStatusLookupStatus28 LookupStatus() : error_msg(""), success(true) {} LookupStatusLookupStatus29 explicit LookupStatus(const std::string& msg) 30 : error_msg(msg), success(false) {} 31 std::string error_msg; 32 bool success; 33 OKLookupStatus34 static LookupStatus OK() { return LookupStatus(); } 35 }; 36 37 class WordpieceVocab { 38 public: ~WordpieceVocab()39 virtual ~WordpieceVocab() {} 40 virtual LookupStatus Contains(const absl::string_view key, 41 bool* value) const = 0; 42 }; 43 44 LookupStatus WordpieceTokenize( 45 const absl::string_view token, const int max_bytes_per_token, 46 const int max_chars_per_subtoken, const std::string& suffix_indicator, 47 bool use_unknown_token, const std::string& unknown_token, 48 bool split_unknown_characters, const WordpieceVocab* vocab_map, 49 std::vector<std::string>* subwords, std::vector<int>* begin_offset, 50 std::vector<int>* end_offset, int* num_word_pieces); 51 52 // As above but with `max_bytes_per_subtoken` unknown, 53 // and split_unknown_characters=false. (For backwards compatibility.) 54 LookupStatus WordpieceTokenize( 55 const absl::string_view token, const int max_bytes_per_token, 56 const std::string& suffix_indicator, bool use_unknown_token, 57 const std::string& unknown_token, const WordpieceVocab* vocab_map, 58 std::vector<std::string>* subwords, std::vector<int>* begin_offset, 59 std::vector<int>* end_offset, int* num_word_pieces); 60 61 } // namespace libtextclassifier3 62 63 #endif // LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_ 64