/**
 * Copyright 2020-2023 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_TEXT_H_
#define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_TEXT_H_

#include <memory>
#include <optional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include "include/api/dual_abi_helper.h"
#include "include/api/status.h"
#include "include/dataset/constants.h"
#include "include/dataset/transforms.h"

namespace mindspore {
namespace dataset {
class TensorOperation;
class Vectors;

using WordIdType = int32_t;
using WordType = std::string;

40 /// \brief Vocab object that is used to save pairs of words and ids.
41 /// \note It contains a map that maps each word(str) to an id(int) or reverse.
42 class Vocab {
43  public:
44   /// \brief Build a vocab from an unordered_map. IDs should be no duplicate and continuous.
45   /// \param[in] words An unordered_map containing word id pair.
46   /// \param[out] vocab A vocab object.
47   /// \return Status code.
48   /// \par Example
49   /// \code
50   ///     // Build a map
51   ///     std::unordered_map<std::string, int32_t> dict;
52   ///     dict["banana"] = 0;
53   ///     dict["apple"] = 1;
54   ///     dict["cat"] = 2;
55   ///     dict["dog"] = 3;
56   ///     // Build vocab from map
57   ///     std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
58   ///     Status s = Vocab::BuildFromUnorderedMap(dict, &vocab);
59   /// \endcode
60   static Status BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words,
61                                       std::shared_ptr<Vocab> *vocab);
62 
63   /// \brief Build a vocab from a c++ vector. id no duplicate and continuous.
64   /// \param[in] words A vector of string containing words.
65   /// \param[in] special_tokens A vector of string containing special tokens.
66   /// \param[in] prepend_special Whether the special_tokens will be prepended/appended to vocab.
67   /// \param[out] vocab A vocab object.
68   /// \return Status code.
69   /// \par Example
70   /// \code
71   ///     // Build vocab from a vector of words, special tokens are prepended to vocab
72   ///     std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"};
73   ///     std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
74   ///     Status s = Vocab::BuildFromVector(list, {"<unk>"}, true, &vocab);
75   /// \endcode
76   static Status BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens,
77                                 bool prepend_special, std::shared_ptr<Vocab> *vocab);
78 
79   /// \brief Build a vocab from vocab file, IDs will be automatically assigned.
80   /// \param[in] path Path to vocab file, each line in file is assumed as a word (including space).
81   /// \param[in] delimiter Delimiter to break each line, characters after the delimiter will be deprecated.
82   /// \param[in] vocab_size Number of lines to be read from file.
83   /// \param[in] special_tokens A vector of string containing special tokens.
84   /// \param[in] prepend_special Whether the special_tokens will be prepended/appended to vocab.
85   /// \param[out] vocab A vocab object.
86   /// \return Status code.
87   /// \par Example
88   /// \code
89   ///     // Build vocab from local file
90   ///     std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt";
91   ///     std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
92   ///     Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"<pad>", "<unk>"}, true, &vocab);
93   /// \endcode
94   static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size,
95                               const std::vector<WordType> &special_tokens, bool prepend_special,
96                               std::shared_ptr<Vocab> *vocab);
97 
98   /// Lookup the id of a word, if the word doesn't exist in vocab, return -1.
99   /// \param word Word to be looked up.
100   /// \return ID of the word in the vocab.
101   /// \par Example
102   /// \code
103   ///     // lookup, convert token to id
104   ///     auto single_index = vocab->TokensToIds("home");
105   ///     single_index = vocab->TokensToIds("hello");
106   /// \endcode
107   WordIdType TokensToIds(const WordType &word) const;
108 
109   /// Lookup the id of a word, if the word doesn't exist in vocab, return -1.
110   /// \param words Words to be looked up.
111   /// \return ID of the word in the vocab.
112   /// \par Example
113   /// \code
114   ///     // lookup multiple tokens
115   ///     auto multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "behind"});
116   ///     std::vector<int32_t> expected_multi_indexs = {0, 4};
117   ///     multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "apple"});
118   ///     expected_multi_indexs = {0, -1};
119   /// \endcode
120   std::vector<WordIdType> TokensToIds(const std::vector<WordType> &words) const;
121 
122   /// Lookup the word of an ID, if ID doesn't exist in vocab, return empty string.
123   /// \param id ID to be looked up.
124   /// \return Indicates the word corresponding to the ID.
125   /// \par Example
126   /// \code
127   ///     // reverse lookup, convert id to token
128   ///     auto single_word = vocab->IdsToTokens(2);
129   ///     single_word = vocab->IdsToTokens(-1);
130   /// \endcode
131   WordType IdsToTokens(const WordIdType &id);
132 
133   /// Lookup the word of an ID, if ID doesn't exist in vocab, return empty string.
134   /// \param ids ID to be looked up.
135   /// \return Indicates the word corresponding to the ID.
136   /// \par Example
137   /// \code
138   ///     // reverse lookup multiple ids
139   ///     auto multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 4});
140   ///     std::vector<std::string> expected_multi_words = {"<pad>", "behind"};
141   ///     multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 99});
142   ///     expected_multi_words = {"<pad>", ""};
143   /// \endcode
144   std::vector<WordType> IdsToTokens(const std::vector<WordIdType> &ids);
145 
146   /// Constructor, shouldn't be called directly, can't be private due to std::make_unique().
147   /// \param map Sanitized word2id map.
148   explicit Vocab(std::unordered_map<WordType, WordIdType> map);
149 
150   /// \brief Add one word to vocab, increment it's index automatically.
151   /// \param word Word to be added, word will skip if word already exists.
152   void AppendWord(const std::string &word);
153 
154   /// \brief Return a read-only vocab in unordered_map type.
155   /// \return A unordered_map of word2id.
GetVocab()156   const std::unordered_map<WordType, WordIdType> &GetVocab() const { return word2id_; }
157 
158   /// \brief Constructor.
159   Vocab() = default;
160 
161   /// \brief Destructor.
162   ~Vocab() = default;
163 
164   static const WordIdType kNoTokenExists;
165   static const WordType kNoIdExists;
166 
167  private:
168   std::unordered_map<WordType, WordIdType> word2id_;
169   std::unordered_map<WordIdType, WordType> id2word_;
170 };
171 
172 /// \brief SentencePiece object that is used to do words segmentation.
173 class SentencePieceVocab {
174  public:
175   /// \brief Build a SentencePiece object from a file.
176   /// \param[in] path_list Path to the file which contains the SentencePiece list.
177   /// \param[in] vocab_size Vocabulary size.
178   /// \param[in] character_coverage Amount of characters covered by the model, good defaults are: 0.9995 for
179   ///              languages with rich character set like Japanese or Chinese and 1.0 for other languages with small
180   ///              character set.
181   /// \param[in] model_type It can be any of [SentencePieceModel::kUnigram, SentencePieceModel::kBpe,
182   ///              SentencePieceModel::kChar, SentencePieceModel::kWord], default is SentencePieceModel::kUnigram. The
183   ///              input sentence must be pre-tokenized when using SentencePieceModel.WORD type.
184   ///              - SentencePieceModel.kUnigram, Unigram Language Model means the next word in the sentence is assumed
185   ///                to be independent of the previous words generated by the model.
186   ///              - SentencePieceModel.kBpe, refers to byte pair encoding algorithm, which replaces the most frequent
187   ///                pair of bytes in a sentence with a single, unused byte.
188   ///              - SentencePieceModel.kChar, refers to char based sentencePiece Model type.
189   ///              - SentencePieceModel.kWord, refers to word based sentencePiece Model type.
190   /// \param[in] params A dictionary with no incoming parameters(The parameters are derived from SentencePiece library).
191   /// \param[out] vocab A SentencePieceVocab object.
192   /// \return SentencePieceVocab, vocab built from the file.
193   /// \par Example
194   /// \code
195   ///     std::string dataset_path;
196   ///     dataset_path = datasets_root_path_ + "/test_sentencepiece/vocab.txt";
197   ///     std::vector<std::string> path_list;
198   ///     path_list.emplace_back(dataset_path);
199   ///     std::unordered_map<std::string, std::string> param_map;
200   ///     std::shared_ptr<SentencePieceVocab> spm = std::make_unique<SentencePieceVocab>();
201   ///     Status rc = SentencePieceVocab::BuildFromFile(path_list, 5000, 0.9995,
202   ///                                                   SentencePieceModel::kUnigram, param_map, &spm);
203   /// \endcode
204   static Status BuildFromFile(const std::vector<std::string> &path_list, int32_t vocab_size, float character_coverage,
205                               const SentencePieceModel &model_type,
206                               const std::unordered_map<std::string, std::string> &params,
207                               std::shared_ptr<SentencePieceVocab> *vocab);
208 
209   /// \brief Save the SentencePiece model into given file path.
210   /// \param[in] vocab A SentencePiece object to be saved.
211   /// \param[in] path Path to store the model.
212   /// \param[in] filename The save name of model file.
213   /// \par Example
214   /// \code
215   ///     // Save vocab model to local
216   ///     vocab->SaveModel(&vocab, datasets_root_path_ + "/test_sentencepiece", "m.model");
217   /// \endcode
218   static Status SaveModel(const std::shared_ptr<SentencePieceVocab> *vocab, const std::string &path,
219                           const std::string &filename);
220 
221   /// \brief Constructor.
222   SentencePieceVocab();
223 
224   /// \brief Destructor.
225   ~SentencePieceVocab() = default;
226 
227   const std::string &model_proto();
228 
229   void set_model_proto(const std::string &model_proto);
230 
231  private:
232   std::string model_proto_;
233 };
234 
// Transform operations for text
namespace text {
237 /// \brief Add token to beginning or end of sequence.
238 class DATASET_API AddToken final : public TensorTransform {
239  public:
240   /// \brief Constructor.
241   /// \param[in] token The token to be added.
242   /// \param[in] begin Whether to insert token at start or end of sequence. Default: true.
243   /// \par Example
244   /// \code
245   ///     /* Define operations */
246   ///     auto add_token_op = text::AddToken(token='TOKEN', begin=True);
247   ///
248   ///     /* dataset is an instance of Dataset object */
249   ///     dataset = dataset->Map({add_token_op},   // operations
250   ///                            {"text"});       // input columns
251   /// \endcode
252   explicit AddToken(const std::string &token, bool begin = true);
253 
254   /// \brief Destructor.
255   ~AddToken() override = default;
256 
257  protected:
258   /// \brief Function to convert TensorTransform object into a TensorOperation object.
259   /// \return Shared pointer to TensorOperation object.
260   std::shared_ptr<TensorOperation> Parse() override;
261 
262  private:
263   struct Data;
264   std::shared_ptr<Data> data_;
265 };
266 
#ifndef _WIN32
268 /// \brief Tokenize a scalar tensor of UTF-8 string by specific rules.
269 /// \note BasicTokenizer is not supported on the Windows platform yet.
270 class DATASET_API BasicTokenizer final : public TensorTransform {
271  public:
272   /// \brief Constructor.
273   /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations to
274   ///    the input text to fold the text to lower case and strip accents characters. If false, only apply
275   ///    the NormalizeUTF8('normalization_form' mode) operation to the input text (default=false).
276   /// \param[in] keep_whitespace If true, the whitespace will be kept in output tokens (default=false).
277   /// \param[in] normalize_form This parameter is used to specify a specific normalize mode. This is only effective
278   ///    when 'lower_case' is false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
279   /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and
280   ///    '[MASK]' (default=true).
281   /// \param[in] with_offsets Whether to output offsets of tokens (default=false).
282   /// \par Example
283   /// \code
284   ///     /* Define operations */
285   ///     auto tokenizer_op = text::BasicTokenizer();
286   ///
287   ///     /* dataset is an instance of Dataset object */
288   ///     dataset = dataset->Map({tokenizer_op},   // operations
289   ///                            {"text"});        // input columns
290   /// \endcode
291   explicit BasicTokenizer(bool lower_case = false, bool keep_whitespace = false,
292                           NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true,
293                           bool with_offsets = false);
294 
295   /// \brief Destructor
296   ~BasicTokenizer() override = default;
297 
298  protected:
299   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
300   /// \return Shared pointer to the TensorOperation object.
301   std::shared_ptr<TensorOperation> Parse() override;
302 
303  private:
304   struct Data;
305   std::shared_ptr<Data> data_;
306 };
307 
308 /// \brief A tokenizer used for Bert text process.
309 /// \note BertTokenizer is not supported on the Windows platform yet.
310 class DATASET_API BertTokenizer final : public TensorTransform {
311  public:
312   /// \brief Constructor.
313   /// \param[in] vocab A Vocab object.
314   /// \param[in] suffix_indicator This parameter is used to show that the sub-word
315   ///    is the last part of a word (default='##').
316   /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
317   /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty
318   ///    string, else return the specified string (default='[UNK]').
319   /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations to
320   ///    the input text to fold the text to lower case and strip accents characters. If false, only apply
321   ///    the NormalizeUTF8('normalization_form' mode) operation to the input text (default=false).
322   /// \param[in] keep_whitespace If true, the whitespace will be kept in output tokens (default=false).
323   /// \param[in] normalize_form This parameter is used to specify a specific normalize mode. This is only effective
324   ///    when 'lower_case' is false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
325   /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and
326   ///   '[MASK]' (default=true).
327   /// \param[in] with_offsets Whether to output offsets of tokens (default=false).
328   /// \par Example
329   /// \code
330   ///     /* Define operations */
331   ///     std::vector<std::string> list = {"a", "b", "c", "d"};
332   ///     std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
333   ///     Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
334   ///     auto tokenizer_op = text::BertTokenizer(vocab);
335   ///
336   ///     /* dataset is an instance of Dataset object */
337   ///     dataset = dataset->Map({tokenizer_op},   // operations
338   ///                            {"text"});        // input columns
339   /// \endcode
340   explicit BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##",
341                          int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]",
342                          bool lower_case = false, bool keep_whitespace = false,
343                          const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true,
344                          bool with_offsets = false)
BertTokenizer(vocab,StringToChar (suffix_indicator),max_bytes_per_token,StringToChar (unknown_token),lower_case,keep_whitespace,normalize_form,preserve_unused_token,with_offsets)345       : BertTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token),
346                       lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets) {}
347   /// \brief Constructor.
348   /// \param[in] vocab A Vocab object.
349   /// \param[in] suffix_indicator This parameter is used to show that the sub-word
350   ///    is the last part of a word (default='##').
351   /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
352   /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty
353   ///    string, else return the specified string (default='[UNK]').
354   /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations to
355   ///    the input text to fold the text to lower case and strip accents characters. If false, only apply
356   ///    the NormalizeUTF8('normalization_form' mode) operation to the input text (default=false).
357   /// \param[in] keep_whitespace If true, the whitespace will be kept in output tokens (default=false).
358   /// \param[in] normalize_form This parameter is used to specify a specific normalize mode. This is only effective
359   ///    when 'lower_case' is false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
360   /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and
361   ///   '[MASK]' (default=true).
362   /// \param[in] with_offsets Whether to output offsets of tokens (default=false).
363   BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
364                 int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool lower_case,
365                 bool keep_whitespace, NormalizeForm normalize_form, bool preserve_unused_token, bool with_offsets);
366 
367   /// \brief Destructor
368   ~BertTokenizer() override = default;
369 
370  protected:
371   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
372   /// \return Shared pointer to the TensorOperation object.
373   std::shared_ptr<TensorOperation> Parse() override;
374 
375  private:
376   struct Data;
377   std::shared_ptr<Data> data_;
378 };
379 
380 /// \brief Apply case fold operation on UTF-8 string tensors.
381 class DATASET_API CaseFold final : public TensorTransform {
382  public:
383   /// \brief Constructor.
384   /// \par Example
385   /// \code
386   ///     /* Define operations */
387   ///     auto casefold_op = text::CaseFold();
388   ///
389   ///     /* dataset is an instance of Dataset object */
390   ///     dataset = dataset->Map({casefold_op},   // operations
391   ///                            {"text"});       // input columns
392   /// \endcode
393   CaseFold();
394 
395   /// \brief Destructor
396   ~CaseFold() override = default;
397 
398  protected:
399   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
400   /// \return Shared pointer to the TensorOperation object.
401   std::shared_ptr<TensorOperation> Parse() override;
402 };
403 
404 /// \brief Filter wikipedia xml lines.
405 class FilterWikipediaXML final : public TensorTransform {
406  public:
407   /// \brief Constructor.
408   FilterWikipediaXML();
409 
410   /// \brief Destructor
411   ~FilterWikipediaXML() override = default;
412 
413  protected:
414   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
415   /// \return Shared pointer to the TensorOperation object.
416   std::shared_ptr<TensorOperation> Parse() override;
417 };
#endif

420 /// \brief Tokenize a Chinese string into words based on the dictionary.
421 /// \note The integrity of the HMMSegment algorithm and MPSegment algorithm files must be confirmed.
422 class DATASET_API JiebaTokenizer final : public TensorTransform {
423  public:
424   /// \brief Constructor.
425   /// \param[in] hmm_path Dictionary file is used by the HMMSegment algorithm. The dictionary can be obtained on the
426   ///   official website of cppjieba (https://github.com/yanyiwu/cppjieba).
427   /// \param[in] mp_path Dictionary file is used by the MPSegment algorithm. The dictionary can be obtained on the
428   ///   official website of cppjieba (https://github.com/yanyiwu/cppjieba).
429   /// \param[in] mode Valid values can be any of JiebaMode.kMP, JiebaMode.kHMM and JiebaMode.kMIX
430   ///   (default=JiebaMode.kMIX).
431   ///   - JiebaMode.kMP, tokenizes with MPSegment algorithm.
432   ///   - JiebaMode.kHMM, tokenizes with Hidden Markov Model Segment algorithm.
433   ///   - JiebaMode.kMIX, tokenizes with a mix of MPSegment and HMMSegment algorithms.
434   /// \param[in] with_offsets Whether to output offsets of tokens (default=false).
435   /// \par Example
436   /// \code
437   ///     /* Define operations */
438   ///     auto tokenizer_op = text::JiebaTokenizer("/path/to/hmm/file", "/path/to/mp/file");
439   ///
440   ///     /* dataset is an instance of Dataset object */
441   ///     dataset = dataset->Map({tokenizer_op},   // operations
442   ///                            {"text"});        // input columns
443   /// \endcode
444   JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode = JiebaMode::kMix,
445                  bool with_offsets = false)
JiebaTokenizer(StringToChar (hmm_path),StringToChar (mp_path),mode,with_offsets)446       : JiebaTokenizer(StringToChar(hmm_path), StringToChar(mp_path), mode, with_offsets) {}
447 
448   /// \brief Constructor.
449   /// \param[in] hmm_path Dictionary file is used by the HMMSegment algorithm. The dictionary can be obtained on the
450   ///   official website of cppjieba (https://github.com/yanyiwu/cppjieba).
451   /// \param[in] mp_path Dictionary file is used by the MPSegment algorithm. The dictionary can be obtained on the
452   ///   official website of cppjieba (https://github.com/yanyiwu/cppjieba).
453   /// \param[in] mode Valid values can be any of JiebaMode.kMP, JiebaMode.kHMM and JiebaMode.kMIX
454   ///   (default=JiebaMode.kMIX).
455   ///   - JiebaMode.kMP, tokenizes with MPSegment algorithm.
456   ///   - JiebaMode.kHMM, tokenizes with Hidden Markov Model Segment algorithm.
457   ///   - JiebaMode.kMIX, tokenizes with a mix of MPSegment and HMMSegment algorithms.
458   /// \param[in] with_offsets Whether to output offsets of tokens (default=false).
459   JiebaTokenizer(const std::vector<char> &hmm_path, const std::vector<char> &mp_path, const JiebaMode &mode,
460                  bool with_offsets);
461 
462   /// \brief Destructor
463   ~JiebaTokenizer() override = default;
464 
465   /// \brief Add a user defined word to the JiebaTokenizer's dictionary.
466   /// \param[in] word The word to be added to the JiebaTokenizer instance.
467   ///   The added word will not be written into the built-in dictionary on disk.
468   /// \param[in] freq The frequency of the word to be added. The higher the frequency,
469   ///   the better chance the word will be tokenized (default=None, use default frequency).
470   /// \return Status error code, returns OK if no error is encountered.
471   /// \par Example
472   /// \code
473   ///     /* Define operations */
474   ///     auto tokenizer_op = text::JiebaTokenizer("/path/to/hmm/file", "/path/to/mp/file");
475   ///
476   ///     Status s = tokenizer_op.AddWord("hello", 2);
477   /// \endcode
478   Status AddWord(const std::string &word, int64_t freq = 0) { return AddWordChar(StringToChar(word), freq); }
479 
480   /// \brief Add a user defined dictionary of word-freq pairs to the JiebaTokenizer's dictionary.
481   /// \param[in] user_dict Vector of word-freq pairs to be added to the JiebaTokenizer's dictionary.
482   /// \return Status error code, returns OK if no error is encountered.
483   /// \par Example
484   /// \code
485   ///     /* Define operations */
486   ///     auto tokenizer_op = text::JiebaTokenizer("/path/to/hmm/file", "/path/to/mp/file");
487   ///
488   ///     std::vector<std::pair<std::string, int64_t>> user_dict = {{"a", 1}, {"b", 2}, {"c", 3}};
489   ///     Status s = tokenizer_op.AddDict(user_dict);
490   /// \endcode
AddDict(const std::vector<std::pair<std::string,int64_t>> & user_dict)491   Status AddDict(const std::vector<std::pair<std::string, int64_t>> &user_dict) {
492     return AddDictChar(PairStringInt64ToPairCharInt64(user_dict));
493   }
494 
495   /// \brief Add user defined dictionary of word-freq pairs to the JiebaTokenizer's dictionary from a file.
496   ///   Only valid word-freq pairs in user defined file will be added into the dictionary.
497   ///   Rows containing invalid inputs will be ignored, no error nor warning status is returned.
498   /// \param[in] file_path Path to the dictionary which includes user defined word-freq pairs.
499   /// \return Status error code, returns OK if no error is encountered.
500   /// \par Example
501   /// \code
502   ///     /* Define operations */
503   ///     auto tokenizer_op = text::JiebaTokenizer("/path/to/hmm/file", "/path/to/mp/file");
504   ///
505   ///     Status s = tokenizer_op.AddDict("/path/to/dict/file");
506   /// \endcode
AddDict(const std::string & file_path)507   Status AddDict(const std::string &file_path) { return AddDictChar(StringToChar(file_path)); }
508 
509  protected:
510   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
511   /// \return Shared pointer to the TensorOperation object.
512   std::shared_ptr<TensorOperation> Parse() override;
513 
514  private:
515   /// \brief Parser user defined words by files.
516   /// \param[in] file_path Path to the user defined file.
517   /// \param[in] user_dict Vector of word-freq pairs extracted from the user defined file.
518   Status ParserFile(const std::string &file_path, std::vector<std::pair<std::string, int64_t>> *const user_dict);
519 
520   /// \brief Used to translate all API strings to vector of char and reverse.
521   Status AddWordChar(const std::vector<char> &word, int64_t freq = 0);
522 
523   /// \brief Used to translate all API strings to vector of char and reverse.
524   Status AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict);
525 
526   /// \brief Used to translate all API strings to vector of char and reverse.
527   Status AddDictChar(const std::vector<char> &file_path);
528 
529   struct Data;
530   std::shared_ptr<Data> data_;
531 };
532 
533 /// \brief Look up a word into an id according to the input vocabulary table.
534 class DATASET_API Lookup final : public TensorTransform {
535  public:
536   /// \brief Constructor.
537   /// \param[in] vocab a Vocab object.
538   /// \param[in] unknown_token Word is used for lookup. In case of the word is out of vocabulary (OOV),
539   ///    the result of lookup will be replaced to unknown_token. If the unknown_token is not specified or it is OOV,
540   ///    runtime error will be thrown (default={}, means no unknown_token is specified).
541   /// \param[in] data_type mindspore::DataType of the tensor after lookup; must be numeric, including bool.
542   ///   (default=mindspore::DataType::kNumberTypeInt32).
543   /// \par Example
544   /// \code
545   ///     /* Define operations */
546   ///    std::vector<std::string> list = {"a", "b", "c", "d"};
547   ///     std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
548   ///     Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
549   ///     auto lookup_op = text::Lookup(vocab, "[unk]");
550   ///
551   ///     /* dataset is an instance of Dataset object */
552   ///     dataset = dataset->Map({lookup_op},   // operations
553   ///                            {"text"});     // input columns
554   /// \endcode
555   explicit Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token = {},
556                   mindspore::DataType data_type = mindspore::DataType::kNumberTypeInt32) {
557     std::optional<std::vector<char>> unknown_token_c = std::nullopt;
558     if (unknown_token != std::nullopt) {
559       unknown_token_c = std::vector<char>(unknown_token->begin(), unknown_token->end());
560     }
561     new (this) Lookup(vocab, unknown_token_c, data_type);
562   }
563 
564   /// \brief Constructor.
565   /// \param[in] vocab a Vocab object.
566   /// \param[in] unknown_token Word is used for lookup. In case of the word is out of vocabulary (OOV),
567   ///    the result of lookup will be replaced to unknown_token. If the unknown_token is not specified or it is OOV,
568   ///    runtime error will be thrown (default={}, means no unknown_token is specified).
569   /// \param[in] data_type mindspore::DataType of the tensor after lookup; must be numeric, including bool.
570   ///   (default=mindspore::DataType::kNumberTypeInt32).
571   Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
572          mindspore::DataType data_type = mindspore::DataType::kNumberTypeInt32);
573 
574   /// \brief Destructor
575   ~Lookup() override = default;
576 
577  protected:
578   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
579   /// \return Shared pointer to the TensorOperation object.
580   std::shared_ptr<TensorOperation> Parse() override;
581 
582  private:
583   struct Data;
584   std::shared_ptr<Data> data_;
585 };
586 
587 /// \brief Generate n-gram from a 1-D string Tensor.
588 class DATASET_API Ngram final : public TensorTransform {
589  public:
590   /// \brief Constructor.
591   /// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result
592   ///    would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
593   ///    a n-gram, an empty string will be returned.
594   /// \param[in] left_pad {"pad_token", pad_width}. Padding performed on left side of the sequence. pad_width will
595   ///    be capped at n-1. left_pad=("_",2) would pad the left side of the sequence with "__" (default={"", 0}}).
596   /// \param[in] right_pad {"pad_token", pad_width}. Padding performed on right side of the sequence.pad_width will
597   ///    be capped at n-1. right_pad=("-",2) would pad the right side of the sequence with "--" (default={"", 0}}).
598   /// \param[in] separator Symbol used to join strings together (default=" ").
599   /// \par Example
600   /// \code
601   ///     /* Define operations */
602   ///     auto ngram_op = text::Ngram({2, 3}, {"&", 2}, {"&", 2}, "-");
603   ///
604   ///     /* dataset is an instance of Dataset object */
605   ///     dataset = dataset->Map({ngram_op},   // operations
606   ///                            {"text"});    // input columns
607   /// \endcode
608   explicit Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad = {"", 0},
609                  const std::pair<std::string, int32_t> &right_pad = {"", 0}, const std::string &separator = " ")
Ngram(ngrams,PairStringToChar (left_pad),PairStringToChar (right_pad),StringToChar (separator))610       : Ngram(ngrams, PairStringToChar(left_pad), PairStringToChar(right_pad), StringToChar(separator)) {}
611 
612   /// \brief Constructor.
613   /// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result
614   ///    would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
615   ///    a n-gram, an empty string will be returned.
616   /// \param[in] left_pad {"pad_token", pad_width}. Padding performed on left side of the sequence. pad_width will
617   ///    be capped at n-1. left_pad=("_",2) would pad the left side of the sequence with "__" (default={"", 0}}).
618   /// \param[in] right_pad {"pad_token", pad_width}. Padding performed on right side of the sequence.pad_width will
619   ///    be capped at n-1. right_pad=("-",2) would pad the right side of the sequence with "--" (default={"", 0}}).
620   /// \param[in] separator Symbol used to join strings together (default=" ").
621   Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
622         const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator);
623 
624   /// \brief Destructor
625   ~Ngram() override = default;
626 
627  protected:
628   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
629   /// \return Shared pointer to the TensorOperation object.
630   std::shared_ptr<TensorOperation> Parse() override;
631 
632  private:
633   struct Data;
634   std::shared_ptr<Data> data_;
635 };
636 
637 #ifndef _WIN32
638 /// \brief Apply normalize operation to UTF-8 string tensors.
/// \brief Apply normalize operation to UTF-8 string tensors.
class DATASET_API NormalizeUTF8 final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone, NormalizeForm::kNfc,
  ///   NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd] (default=NormalizeForm::kNfkc).
  ///   See <http://unicode.org/reports/tr15/> for details.
  ///   - NormalizeForm.kNone, remain the input string tensor unchanged.
  ///   - NormalizeForm.kNfc, normalizes with Normalization Form C.
  ///   - NormalizeForm.kNfkc, normalizes with Normalization Form KC.
  ///   - NormalizeForm.kNfd, normalizes with Normalization Form D.
  ///   - NormalizeForm.kNfkd, normalizes with Normalization Form KD.
  /// \par Example
  /// \code
  ///     /* Define operations */
  ///     auto normalizeutf8_op = text::NormalizeUTF8();
  ///
  ///     /* dataset is an instance of Dataset object */
  ///     dataset = dataset->Map({normalizeutf8_op},   // operations
  ///                            {"text"});            // input columns
  /// \endcode
  explicit NormalizeUTF8(NormalizeForm normalize_form = NormalizeForm::kNfkc);

  /// \brief Destructor
  ~NormalizeUTF8() override = default;

 protected:
  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to the TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: parameters live in the forward-declared Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
673 
674 /// \brief Replace a UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
/// \brief Replace a UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
class DATASET_API RegexReplace final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] pattern The regex expression patterns.
  /// \param[in] replace The string to replace the matched element.
  /// \param[in] replace_all Confirm whether to replace all. If false, only replace the first matched element;
  ///   if true, replace all matched elements (default=true).
  /// \par Example
  /// \code
  ///     /* Define operations */
  ///     auto regex_op = text::RegexReplace("\\s+", "_", true);
  ///
  ///     /* dataset is an instance of Dataset object */
  ///     dataset = dataset->Map({regex_op},   // operations
  ///                            {"text"});    // input columns
  /// \endcode
  RegexReplace(const std::string &pattern, const std::string &replace, bool replace_all = true)
      // Delegates to the std::vector<char> overload via the dual-ABI string conversion helpers.
      : RegexReplace(StringToChar(pattern), StringToChar(replace), replace_all) {}

  /// \brief Constructor.
  /// \param[in] pattern The regex expression patterns. Type should be vector of char.
  /// \param[in] replace The string to replace the matched element.
  /// \param[in] replace_all Confirm whether to replace all. If false, only replace the first matched element;
  ///   if true, replace all matched elements (default=true).
  RegexReplace(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all);

  /// \brief Destructor
  ~RegexReplace() override = default;

 protected:
  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to the TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: parameters live in the forward-declared Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
713 
714 /// \brief Tokenize a scalar tensor of UTF-8 string by the regex expression pattern.
/// \brief Tokenize a scalar tensor of UTF-8 string by the regex expression pattern.
class DATASET_API RegexTokenizer final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] delim_pattern The pattern of regex delimiters.
  /// \param[in] keep_delim_pattern The string matched with 'delim_pattern' can be kept as a token if it can be
  ///   matched by 'keep_delim_pattern'. The default value is an empty string (""),
  ///   which means that delimiters will not be kept as an output token (default="").
  /// \param[in] with_offsets Whether to output offsets of tokens (default=false).
  /// \par Example
  /// \code
  ///     /* Define operations */
  ///     auto regex_op = text::RegexTokenizer("\\s+", "\\s+", false);
  ///
  ///     /* dataset is an instance of Dataset object */
  ///     dataset = dataset->Map({regex_op},   // operations
  ///                            {"text"});    // input columns
  /// \endcode
  explicit RegexTokenizer(const std::string &delim_pattern, const std::string &keep_delim_pattern = "",
                          bool with_offsets = false)
      // Delegates to the std::vector<char> overload via the dual-ABI string conversion helpers.
      : RegexTokenizer(StringToChar(delim_pattern), StringToChar(keep_delim_pattern), with_offsets) {}

  /// \brief Constructor.
  /// \param[in] delim_pattern The pattern of regex delimiters. Type should be vector of char.
  /// \param[in] keep_delim_pattern The string matched with 'delim_pattern' can be kept as a token if it can be
  ///   matched by 'keep_delim_pattern'. Type should be vector of char.
  /// \param[in] with_offsets Whether to output offsets of tokens.
  explicit RegexTokenizer(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern,
                          bool with_offsets);

  /// \brief Destructor
  ~RegexTokenizer() override = default;

 protected:
  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to the TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: parameters live in the forward-declared Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
751 #endif
752 
753 /// \brief Tokenize a scalar token or a 1-D token to tokens by sentencepiece.
/// \brief Tokenize a scalar token or a 1-D token to tokens by sentencepiece.
class DATASET_API SentencePieceTokenizer final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] vocab a SentencePieceVocab object.
  /// \param[in] out_type The type of the output.
  /// \par Example
  /// \code
  ///     /* Define operations */
  ///     std::shared_ptr<Dataset> ds_vocab = TextFile({"/path/to/vocab/file"}, 0, ShuffleMode::kFalse);
  ///     std::shared_ptr<SentencePieceVocab> vocab =
  ///         ds_vocab->BuildSentencePieceVocab({}, 0, 0.9995, SentencePieceModel::kUnigram, {});
  ///     auto tokenizer_op = text::SentencePieceTokenizer(vocab, mindspore::dataset::SPieceTokenizerOutType::kString);
  ///
  ///     /* dataset is an instance of Dataset object */
  ///     dataset = dataset->Map({tokenizer_op},   // operations
  ///                            {"text"});        // input columns
  /// \endcode
  SentencePieceTokenizer(const std::shared_ptr<SentencePieceVocab> &vocab,
                         mindspore::dataset::SPieceTokenizerOutType out_type);

  /// \brief Constructor.
  /// \param[in] vocab_path vocab model file path.
  /// \param[in] out_type The type of the output.
  /// \par Example
  /// \code
  ///     /* Define operations */
  ///     auto tokenizer_op = text::SentencePieceTokenizer("/path/to/model",
  ///                                                      mindspore::dataset::SPieceTokenizerOutType::kInt);
  ///
  ///     /* dataset is an instance of Dataset object */
  ///     dataset = dataset->Map({tokenizer_op},   // operations
  ///                            {"text"});        // input columns
  /// \endcode
  SentencePieceTokenizer(const std::string &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type)
      // Delegates to the std::vector<char> overload via the dual-ABI string conversion helper.
      : SentencePieceTokenizer(StringToChar(vocab_path), out_type) {}

  /// \brief Constructor.
  /// \param[in] vocab_path vocab model file path. Type should be vector of char.
  /// \param[in] out_type The type of the output.
  SentencePieceTokenizer(const std::vector<char> &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type);

  /// \brief Destructor
  ~SentencePieceTokenizer() override = default;

 protected:
  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to the TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: parameters live in the forward-declared Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
807 
808 /// \brief Construct a tensor from data (only 1-D for now), where each element in the dimension
809 ///   axis is a slice of data starting at the corresponding position, with a specified width.
/// \brief Construct a tensor from data (only 1-D for now), where each element in the dimension
///   axis is a slice of data starting at the corresponding position, with a specified width.
class DATASET_API SlidingWindow final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] width The width of the window. It must be an integer and greater than zero.
  /// \param[in] axis The axis where the sliding window is computed (default=0), axis only
  ///    supports 0 or -1 for now.
  /// \par Example
  /// \code
  ///     /* Define operations */
  ///     auto slidingwindow_op = text::SlidingWindow(5, 0);
  ///
  ///     /* dataset is an instance of Dataset object */
  ///     dataset = dataset->Map({slidingwindow_op},   // operations
  ///                            {"text"});            // input columns
  /// \endcode
  explicit SlidingWindow(int32_t width, int32_t axis = 0);

  /// \brief Destructor
  ~SlidingWindow() override = default;

 protected:
  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to the TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: parameters live in the forward-declared Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
839 
840 /// \brief Convert every element in a string tensor to a number.
841 ///   Strings are cast according to the rules specified in the following links:
842 ///   https://en.cppreference.com/w/cpp/string/basic_string/stof,
843 ///   https://en.cppreference.com/w/cpp/string/basic_string/stoul,
844 ///   except that any strings which represent negative numbers cannot be cast to an unsigned integer type.
/// \brief Convert every element in a string tensor to a number.
///   Strings are cast according to the rules specified in the following links:
///   https://en.cppreference.com/w/cpp/string/basic_string/stof,
///   https://en.cppreference.com/w/cpp/string/basic_string/stoul,
///   except that any strings which represent negative numbers cannot be cast to an unsigned integer type.
class DATASET_API ToNumber final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] data_type mindspore::DataType of the tensor to be cast to. Must be a numeric type, excluding bool.
  /// \par Example
  /// \code
  ///     /* Define operations */
  ///     auto to_number_op = text::ToNumber(mindspore::DataType::kNumberTypeInt8);
  ///
  ///     /* dataset is an instance of Dataset object */
  ///     dataset = dataset->Map({to_number_op},   // operations
  ///                            {"text"});        // input columns
  /// \endcode
  explicit ToNumber(mindspore::DataType data_type);

  /// \brief Destructor
  ~ToNumber() override = default;

 protected:
  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to the TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: parameters live in the forward-declared Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
872 
873 /// \brief Look up a token into an vector according to the input Vectors table.
874 class DATASET_API ToVectors final : public TensorTransform {
875  public:
876   /// \brief Constructor.
877   /// \param[in] vectors A Vectors object.
878   /// \param[in] unk_init In case of the token is out-of-vectors (OOV), the result will be initialized with `unk_init`.
879   ///     (default={}, means to initialize with zero vectors).
880   /// \param[in] lower_case_backup Whether to look up the token in the lower case (default=false).
881   explicit ToVectors(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init = {},
882                      bool lower_case_backup = false);
883 
884   /// \brief Destructor
885   ~ToVectors() override = default;
886 
887  protected:
888   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
889   /// \return Shared pointer to the TensorOperation object.
890   std::shared_ptr<TensorOperation> Parse() override;
891 
892  private:
893   struct Data;
894   std::shared_ptr<Data> data_;
895 };
896 
897 /// \brief Truncate the input sequence so that it does not exceed the maximum length.
898 class DATASET_API Truncate final : public TensorTransform {
899  public:
900   /// \brief Constructor.
901   /// \param[in] max_seq_len Maximum allowable length.
902   /// \par Example
903   /// \code
904   ///     /* Define operations */
905   ///     auto truncate_op = text::Truncate(5);
906   ///
907   ///     /* dataset is an instance of Dataset object */
908   ///     dataset = dataset->Map({truncate_op},   // operations
909   ///                            {"text"});       // input columns
910   /// \endcode
911   explicit Truncate(int32_t max_seq_len);
912 
913   /// \brief Destructor.
914   ~Truncate() override = default;
915 
916  protected:
917   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
918   /// \return Shared pointer to the TensorOperation object.
919   std::shared_ptr<TensorOperation> Parse() override;
920 
921  private:
922   struct Data;
923   std::shared_ptr<Data> data_;
924 };
925 
926 /// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length.
927 class DATASET_API TruncateSequencePair final : public TensorTransform {
928  public:
929   /// \brief Constructor.
930   /// \param[in] max_length Maximum length required.
931   /// \par Example
932   /// \code
933   ///     /* Define operations */
934   ///     auto truncate_op = text::TruncateSequencePair(5);
935   ///
936   ///     /* dataset is an instance of Dataset object */
937   ///     dataset = dataset->Map({truncate_op},   // operations
938   ///                            {"text"});       // input columns
939   /// \endcode
940   explicit TruncateSequencePair(int32_t max_length);
941 
942   /// \brief Destructor
943   ~TruncateSequencePair() override = default;
944 
945  protected:
946   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
947   /// \return Shared pointer to the TensorOperation object.
948   std::shared_ptr<TensorOperation> Parse() override;
949 
950  private:
951   struct Data;
952   std::shared_ptr<Data> data_;
953 };
954 
955 /// \brief Tokenize a scalar tensor of UTF-8 string to Unicode characters.
/// \brief Tokenize a scalar tensor of UTF-8 string to Unicode characters.
class DATASET_API UnicodeCharTokenizer final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] with_offsets whether to output offsets of tokens (default=false).
  /// \par Example
  /// \code
  ///     /* Define operations */
  ///     auto tokenizer_op = text::UnicodeCharTokenizer();
  ///
  ///     /* dataset is an instance of Dataset object */
  ///     dataset = dataset->Map({tokenizer_op},   // operations
  ///                            {"text"});        // input columns
  /// \endcode
  explicit UnicodeCharTokenizer(bool with_offsets = false);

  /// \brief Destructor
  ~UnicodeCharTokenizer() override = default;

 protected:
  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to the TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: parameters live in the forward-declared Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
983 
984 /// \brief Tokenize scalar token or 1-D tokens to 1-D sub-word tokens.
985 class DATASET_API WordpieceTokenizer final : public TensorTransform {
986  public:
987   /// \brief Constructor.
988   /// \param[in] vocab A Vocab object.
989   /// \param[in] suffix_indicator This parameter is used to show that the sub-word
990   ///    is the last part of a word (default='##').
991   /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
992   /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty
993   ///    string, else return the specified string (default='[UNK]').
994   /// \param[in] with_offsets whether to output offsets of tokens (default=false).
995   /// \par Example
996   /// \code
997   ///     /* Define operations */
998   ///     std::vector<std::string> word_list = {"book", "apple", "rabbit"};
999   ///     std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
1000   ///     Status s = Vocab::BuildFromVector(word_list, {}, true, &vocab);
1001   ///     auto tokenizer_op = text::WordpieceTokenizer(vocab);
1002   ///
1003   ///     /* dataset is an instance of Dataset object */
1004   ///     dataset = dataset->Map({tokenizer_op},   // operations
1005   ///                            {"text"});        // input columns
1006   /// \endcode
1007   explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##",
1008                               int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]",
1009                               bool with_offsets = false)
WordpieceTokenizer(vocab,StringToChar (suffix_indicator),max_bytes_per_token,StringToChar (unknown_token),with_offsets)1010       : WordpieceTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token),
1011                            with_offsets) {}
1012 
1013   explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
1014                               int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool with_offsets);
1015 
1016   /// \brief Destructor
1017   ~WordpieceTokenizer() override = default;
1018 
1019  protected:
1020   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
1021   /// \return Shared pointer to the TensorOperation object.
1022   std::shared_ptr<TensorOperation> Parse() override;
1023 
1024  private:
1025   struct Data;
1026   std::shared_ptr<Data> data_;
1027 };
1028 
1029 #ifndef _WIN32
1030 /// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
/// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
class DATASET_API UnicodeScriptTokenizer final : public TensorTransform {
 public:
  /// \brief Constructor.
  /// \param[in] keep_whitespace whether to emit whitespace tokens (default=false).
  /// \param[in] with_offsets whether to output offsets of tokens (default=false).
  /// \par Example
  /// \code
  ///     /* Define operations */
  ///     auto tokenizer_op = text::UnicodeScriptTokenizer(false, true);
  ///
  ///     /* dataset is an instance of Dataset object */
  ///     dataset = dataset->Map({tokenizer_op},   // operations
  ///                            {"text"});        // input columns
  /// \endcode
  explicit UnicodeScriptTokenizer(bool keep_whitespace = false, bool with_offsets = false);

  /// \brief Destructor
  ~UnicodeScriptTokenizer() override = default;

 protected:
  /// \brief The function to convert a TensorTransform object into a TensorOperation object.
  /// \return Shared pointer to the TensorOperation object.
  std::shared_ptr<TensorOperation> Parse() override;

 private:
  // Pimpl: parameters live in the forward-declared Data struct defined in the .cc file.
  struct Data;
  std::shared_ptr<Data> data_;
};
1059 
1060 /// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces.
1061 class DATASET_API WhitespaceTokenizer final : public TensorTransform {
1062  public:
1063   /// \brief Constructor.
1064   /// \param[in] with_offsets whether to output offsets of tokens (default=false).
1065   /// \par Example
1066   /// \code
1067   ///     /* Define operations */
1068   ///     auto tokenizer_op = text::WhitespaceTokenizer(false);
1069   ///
1070   ///     /* dataset is an instance of Dataset object */
1071   ///     dataset = dataset->Map({tokenizer_op},   // operations
1072   ///                            {"text"});        // input columns
1073   /// \endcode
1074   explicit WhitespaceTokenizer(bool with_offsets = false);
1075 
1076   /// \brief Destructor
1077   ~WhitespaceTokenizer() override = default;
1078 
1079  protected:
1080   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
1081   /// \return Shared pointer to the TensorOperation object.
1082   std::shared_ptr<TensorOperation> Parse() override;
1083 
1084  private:
1085   struct Data;
1086   std::shared_ptr<Data> data_;
1087 };
1088 #endif
1089 }  // namespace text
1090 }  // namespace dataset
1091 }  // namespace mindspore
1092 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_TEXT_H_
1093