/*
 * Copyright (C) 2018 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #ifndef LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_
18 #define LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_
19 
20 #include <string>
21 #include <vector>
22 
23 #include "absl/strings/string_view.h"
24 
25 namespace libtextclassifier3 {
26 
// Outcome of a lookup operation: a success flag plus an error message that is
// only populated when the status was constructed from a message.
struct LookupStatus {
  // Builds a successful status. `error_msg` is left default-constructed,
  // i.e. empty.
  LookupStatus() : success(true) {}

  // Builds a failed status that carries `msg` as its error description.
  explicit LookupStatus(const std::string& msg)
      : error_msg(msg), success(false) {}

  // Failure description; empty for a status built by the default constructor.
  std::string error_msg;
  // True when built by the default constructor, false when built from a
  // message.
  bool success;

  // Convenience factory returning a successful status.
  static LookupStatus OK() { return LookupStatus(); }
};
36 
37 class WordpieceVocab {
38  public:
~WordpieceVocab()39   virtual ~WordpieceVocab() {}
40   virtual LookupStatus Contains(const absl::string_view key,
41                                 bool* value) const = 0;
42 };
43 
44 LookupStatus WordpieceTokenize(
45     const absl::string_view token, const int max_bytes_per_token,
46     const int max_chars_per_subtoken, const std::string& suffix_indicator,
47     bool use_unknown_token, const std::string& unknown_token,
48     bool split_unknown_characters, const WordpieceVocab* vocab_map,
49     std::vector<std::string>* subwords, std::vector<int>* begin_offset,
50     std::vector<int>* end_offset, int* num_word_pieces);
51 
52 // As above but with `max_bytes_per_subtoken` unknown,
53 // and split_unknown_characters=false. (For backwards compatibility.)
54 LookupStatus WordpieceTokenize(
55     const absl::string_view token, const int max_bytes_per_token,
56     const std::string& suffix_indicator, bool use_unknown_token,
57     const std::string& unknown_token, const WordpieceVocab* vocab_map,
58     std::vector<std::string>* subwords, std::vector<int>* begin_offset,
59     std::vector<int>* end_offset, int* num_word_pieces);
60 
61 }  // namespace libtextclassifier3
62 
63 #endif  // LIBTEXTCLASSIFIER_UTILS_WORDPIECE_TOKENIZER_H_
64