• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_TEXT_H_
18 #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_TEXT_H_
19 
20 #include <memory>
21 #include <optional>
22 #include <string>
23 #include <utility>
24 #include <vector>
25 
26 #include "include/api/dual_abi_helper.h"
27 #include "include/api/status.h"
28 #include "include/dataset/constants.h"
29 #include "include/dataset/transforms.h"
30 
31 namespace mindspore {
32 namespace dataset {
33 
34 class Vocab;
35 class SentencePieceVocab;
36 class TensorOperation;
37 
38 // Transform operations for text
39 namespace text {
40 
41 #ifndef _WIN32
42 /// \brief Tokenize a scalar tensor of UTF-8 string by specific rules.
43 /// \note BasicTokenizer is not supported on the Windows platform yet.
44 class BasicTokenizer final : public TensorTransform {
45  public:
46   /// \brief Constructor.
47   /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations to
48   ///    the input text to fold the text to lower case and strip accents characters. If false, only apply
49   ///    the NormalizeUTF8('normalization_form' mode) operation to the input text (default=false).
50   /// \param[in] keep_whitespace If true, the whitespace will be kept in output tokens (default=false).
51   /// \param[in] normalize_form This parameter is used to specify a specific normalize mode. This is only effective
52   ///    when 'lower_case' is false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
53   /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and
54   ///    '[MASK]' (default=true).
55   /// \param[in] with_offsets Whether to output offsets of tokens (default=false).
56   explicit BasicTokenizer(bool lower_case = false, bool keep_whitespace = false,
57                           const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true,
58                           bool with_offsets = false);
59 
60   /// \brief Destructor
61   ~BasicTokenizer() = default;
62 
63  protected:
64   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
65   /// \return Shared pointer to the TensorOperation object.
66   std::shared_ptr<TensorOperation> Parse() override;
67 
68  private:
69   struct Data;
70   std::shared_ptr<Data> data_;
71 };
72 
73 /// \brief A tokenizer used for Bert text process.
74 /// \note BertTokenizer is not supported on the Windows platform yet.
75 class BertTokenizer final : public TensorTransform {
76  public:
77   /// \brief Constructor.
78   /// \param[in] vocab A Vocab object.
79   /// \param[in] suffix_indicator This parameter is used to show that the sub-word
80   ///    is the last part of a word (default='##').
81   /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
82   /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty
83   ///    string, else return the specified string (default='[UNK]').
84   /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations to
85   ///    the input text to fold the text to lower case and strip accents characters. If false, only apply
86   ///    the NormalizeUTF8('normalization_form' mode) operation to the input text (default=false).
87   /// \param[in] keep_whitespace If true, the whitespace will be kept in output tokens (default=false).
88   /// \param[in] normalize_form This parameter is used to specify a specific normalize mode. This is only effective
89   ///    when 'lower_case' is false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
90   /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and
91   ///   '[MASK]' (default=true).
92   /// \param[in] with_offsets Whether to output offsets of tokens (default=false).
93   explicit BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##",
94                          int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]",
95                          bool lower_case = false, bool keep_whitespace = false,
96                          const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true,
97                          bool with_offsets = false)
BertTokenizer(vocab,StringToChar (suffix_indicator),max_bytes_per_token,StringToChar (unknown_token),lower_case,keep_whitespace,normalize_form,preserve_unused_token,with_offsets)98       : BertTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token),
99                       lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets) {}
100   /// \brief Constructor.
101   /// \param[in] vocab A Vocab object.
102   /// \param[in] suffix_indicator This parameter is used to show that the sub-word
103   ///    is the last part of a word (default='##').
104   /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
105   /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty
106   ///    string, else return the specified string (default='[UNK]').
107   /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations to
108   ///    the input text to fold the text to lower case and strip accents characters. If false, only apply
109   ///    the NormalizeUTF8('normalization_form' mode) operation to the input text (default=false).
110   /// \param[in] keep_whitespace If true, the whitespace will be kept in output tokens (default=false).
111   /// \param[in] normalize_form This parameter is used to specify a specific normalize mode. This is only effective
112   ///    when 'lower_case' is false. See NormalizeUTF8 for details (default=NormalizeForm::kNone).
113   /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and
114   ///   '[MASK]' (default=true).
115   /// \param[in] with_offsets Whether to output offsets of tokens (default=false).
116   BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
117                 int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool lower_case,
118                 bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token,
119                 bool with_offsets);
120 
121   /// \brief Destructor
122   ~BertTokenizer() = default;
123 
124  protected:
125   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
126   /// \return Shared pointer to the TensorOperation object.
127   std::shared_ptr<TensorOperation> Parse() override;
128 
129  private:
130   struct Data;
131   std::shared_ptr<Data> data_;
132 };
133 
134 /// \brief Apply case fold operation on UTF-8 string tensors.
135 class CaseFold final : public TensorTransform {
136  public:
137   /// \brief Constructor.
138   CaseFold();
139 
140   /// \brief Destructor
141   ~CaseFold() = default;
142 
143  protected:
144   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
145   /// \return Shared pointer to the TensorOperation object.
146   std::shared_ptr<TensorOperation> Parse() override;
147 };
148 #endif
149 
150 /// \brief Tokenize a Chinese string into words based on the dictionary.
151 /// \note The integrity of the HMMSegment algorithm and MPSegment algorithm files must be confirmed.
152 class JiebaTokenizer final : public TensorTransform {
153  public:
154   /// \brief Constructor.
155   /// \param[in] hmm_path Dictionary file is used by the HMMSegment algorithm. The dictionary can be obtained on the
156   ///   official website of cppjieba (https://github.com/yanyiwu/cppjieba).
157   /// \param[in] mp_path Dictionary file is used by the MPSegment algorithm. The dictionary can be obtained on the
158   ///   official website of cppjieba (https://github.com/yanyiwu/cppjieba).
159   /// \param[in] mode Valid values can be any of JiebaMode.kMP, JiebaMode.kHMM and JiebaMode.kMIX
160   ///   (default=JiebaMode.kMIX).
161   ///   - JiebaMode.kMP, tokenizes with MPSegment algorithm.
162   ///   - JiebaMode.kHMM, tokenizes with Hidden Markov Model Segment algorithm.
163   ///   - JiebaMode.kMIX, tokenizes with a mix of MPSegment and HMMSegment algorithms.
164   /// \param[in] with_offsets Whether to output offsets of tokens (default=false).
165   JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode = JiebaMode::kMix,
166                  bool with_offsets = false)
JiebaTokenizer(StringToChar (hmm_path),StringToChar (mp_path),mode,with_offsets)167       : JiebaTokenizer(StringToChar(hmm_path), StringToChar(mp_path), mode, with_offsets) {}
168 
169   /// \brief Constructor.
170   /// \param[in] hmm_path Dictionary file is used by the HMMSegment algorithm. The dictionary can be obtained on the
171   ///   official website of cppjieba (https://github.com/yanyiwu/cppjieba).
172   /// \param[in] mp_path Dictionary file is used by the MPSegment algorithm. The dictionary can be obtained on the
173   ///   official website of cppjieba (https://github.com/yanyiwu/cppjieba).
174   /// \param[in] mode Valid values can be any of JiebaMode.kMP, JiebaMode.kHMM and JiebaMode.kMIX
175   ///   (default=JiebaMode.kMIX).
176   ///   - JiebaMode.kMP, tokenizes with MPSegment algorithm.
177   ///   - JiebaMode.kHMM, tokenizes with Hidden Markov Model Segment algorithm.
178   ///   - JiebaMode.kMIX, tokenizes with a mix of MPSegment and HMMSegment algorithms.
179   /// \param[in] with_offsets Whether to output offsets of tokens (default=false).
180   JiebaTokenizer(const std::vector<char> &hmm_path, const std::vector<char> &mp_path, const JiebaMode &mode,
181                  bool with_offsets);
182 
183   /// \brief Destructor
184   ~JiebaTokenizer() = default;
185 
186   /// \brief Add a user defined word to the JiebaTokenizer's dictionary.
187   /// \param[in] word The word to be added to the JiebaTokenizer instance.
188   ///   The added word will not be written into the built-in dictionary on disk.
189   /// \param[in] freq The frequency of the word to be added. The higher the frequency,
190   ///   the better chance the word will be tokenized (default=None, use default frequency).
191   /// \return Status error code, returns OK if no error is encountered.
192   Status AddWord(const std::string &word, int64_t freq = 0) { return AddWordChar(StringToChar(word), freq); }
193 
194   /// \brief Add a user defined dictionary of word-freq pairs to the JiebaTokenizer's dictionary.
195   /// \param[in] user_dict Vector of word-freq pairs to be added to the JiebaTokenizer's dictionary.
196   /// \return Status error code, returns OK if no error is encountered.
AddDict(const std::vector<std::pair<std::string,int64_t>> & user_dict)197   Status AddDict(const std::vector<std::pair<std::string, int64_t>> &user_dict) {
198     return AddDictChar(PairStringInt64ToPairCharInt64(user_dict));
199   }
200 
201   /// \brief Add user defined dictionary of word-freq pairs to the JiebaTokenizer's dictionary from a file.
202   ///   Only valid word-freq pairs in user defined file will be added into the dictionary.
203   ///   Rows containing invalid inputs will be ignored, no error nor warning status is returned.
204   /// \param[in] file_path Path to the dictionary which includes user defined word-freq pairs.
205   /// \return Status error code, returns OK if no error is encountered.
AddDict(const std::string & file_path)206   Status AddDict(const std::string &file_path) { return AddDictChar(StringToChar(file_path)); }
207 
208  protected:
209   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
210   /// \return Shared pointer to the TensorOperation object.
211   std::shared_ptr<TensorOperation> Parse() override;
212 
213  private:
214   /// \brief Parser user defined words by files.
215   /// \param[in] file_path Path to the user defined file.
216   /// \param[in] user_dict Vector of word-freq pairs extracted from the user defined file.
217   Status ParserFile(const std::string &file_path, std::vector<std::pair<std::string, int64_t>> *const user_dict);
218 
219   /// \brief Used to translate all API strings to vector of char and reverse.
220   Status AddWordChar(const std::vector<char> &word, int64_t freq = 0);
221 
222   /// \brief Used to translate all API strings to vector of char and reverse.
223   Status AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict);
224 
225   /// \brief Used to translate all API strings to vector of char and reverse.
226   Status AddDictChar(const std::vector<char> &file_path);
227 
228   struct Data;
229   std::shared_ptr<Data> data_;
230 };
231 
232 /// \brief Look up a word into an id according to the input vocabulary table.
233 class Lookup final : public TensorTransform {
234  public:
235   /// \brief Constructor.
236   /// \param[in] vocab a Vocab object.
237   /// \param[in] unknown_token Word is used for lookup. In case of the word is out of vocabulary (OOV),
238   ///    the result of lookup will be replaced to unknown_token. If the unknown_token is not specified or it is OOV,
239   ///    runtime error will be thrown (default={}, means no unknown_token is specified).
240   /// \param[in] data_type mindspore::DataType of the tensor after lookup; must be numeric, including bool.
241   ///   (default=mindspore::DataType::kNumberTypeInt32).
242   explicit Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token = {},
243                   mindspore::DataType data_type = mindspore::DataType::kNumberTypeInt32) {
244     std::optional<std::vector<char>> unknown_token_c = std::nullopt;
245     if (unknown_token != std::nullopt) {
246       unknown_token_c = std::vector<char>(unknown_token->begin(), unknown_token->end());
247     }
248     new (this) Lookup(vocab, unknown_token_c, data_type);
249   }
250 
251   /// \brief Constructor.
252   /// \param[in] vocab a Vocab object.
253   /// \param[in] unknown_token Word is used for lookup. In case of the word is out of vocabulary (OOV),
254   ///    the result of lookup will be replaced to unknown_token. If the unknown_token is not specified or it is OOV,
255   ///    runtime error will be thrown (default={}, means no unknown_token is specified).
256   /// \param[in] data_type mindspore::DataType of the tensor after lookup; must be numeric, including bool.
257   ///   (default=mindspore::DataType::kNumberTypeInt32).
258   Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token,
259          mindspore::DataType data_type = mindspore::DataType::kNumberTypeInt32);
260 
261   /// \brief Destructor
262   ~Lookup() = default;
263 
264  protected:
265   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
266   /// \return Shared pointer to the TensorOperation object.
267   std::shared_ptr<TensorOperation> Parse() override;
268 
269  private:
270   struct Data;
271   std::shared_ptr<Data> data_;
272 };
273 
274 /// \brief Generate n-gram from a 1-D string Tensor.
275 class Ngram final : public TensorTransform {
276  public:
277   /// \brief Constructor.
278   /// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result
279   ///    would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
280   ///    a n-gram, an empty string will be returned.
281   /// \param[in] left_pad {"pad_token", pad_width}. Padding performed on left side of the sequence. pad_width will
282   ///    be capped at n-1. left_pad=("_",2) would pad the left side of the sequence with "__" (default={"", 0}}).
283   /// \param[in] right_pad {"pad_token", pad_width}. Padding performed on right side of the sequence.pad_width will
284   ///    be capped at n-1. right_pad=("-",2) would pad the right side of the sequence with "--" (default={"", 0}}).
285   /// \param[in] separator Symbol used to join strings together (default=" ").
286   explicit Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad = {"", 0},
287                  const std::pair<std::string, int32_t> &right_pad = {"", 0}, const std::string &separator = " ")
Ngram(ngrams,PairStringToChar (left_pad),PairStringToChar (right_pad),StringToChar (separator))288       : Ngram(ngrams, PairStringToChar(left_pad), PairStringToChar(right_pad), StringToChar(separator)) {}
289 
290   /// \brief Constructor.
291   /// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result
292   ///    would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up
293   ///    a n-gram, an empty string will be returned.
294   /// \param[in] left_pad {"pad_token", pad_width}. Padding performed on left side of the sequence. pad_width will
295   ///    be capped at n-1. left_pad=("_",2) would pad the left side of the sequence with "__" (default={"", 0}}).
296   /// \param[in] right_pad {"pad_token", pad_width}. Padding performed on right side of the sequence.pad_width will
297   ///    be capped at n-1. right_pad=("-",2) would pad the right side of the sequence with "--" (default={"", 0}}).
298   /// \param[in] separator Symbol used to join strings together (default=" ").
299   Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad,
300         const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator);
301 
302   /// \brief Destructor
303   ~Ngram() = default;
304 
305  protected:
306   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
307   /// \return Shared pointer to the TensorOperation object.
308   std::shared_ptr<TensorOperation> Parse() override;
309 
310  private:
311   struct Data;
312   std::shared_ptr<Data> data_;
313 };
314 
315 #ifndef _WIN32
316 /// \brief Apply normalize operation to UTF-8 string tensors.
317 class NormalizeUTF8 final : public TensorTransform {
318  public:
319   /// \brief Constructor.
320   /// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone,NormalizeForm::kNfc,
321   ///   NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd](default=NormalizeForm::kNfkc).
322   ///   See http://unicode.org/reports/tr15/ for details.
323   ///   - NormalizeForm.kNone, remain the input string tensor unchanged.
324   ///   - NormalizeForm.kNfc, normalizes with Normalization Form C.
325   ///   - NormalizeForm.kNfkc, normalizes with Normalization Form KC.
326   ///   - NormalizeForm.kNfd, normalizes with Normalization Form D.
327   ///   - NormalizeForm.kNfkd, normalizes with Normalization Form KD.
328   explicit NormalizeUTF8(NormalizeForm normalize_form = NormalizeForm::kNfkc);
329 
330   /// \brief Destructor
331   ~NormalizeUTF8() = default;
332 
333  protected:
334   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
335   /// \return Shared pointer to the TensorOperation object.
336   std::shared_ptr<TensorOperation> Parse() override;
337 
338  private:
339   struct Data;
340   std::shared_ptr<Data> data_;
341 };
342 
343 /// \brief Replace a UTF-8 string tensor with 'replace' according to regular expression 'pattern'.
344 class RegexReplace final : public TensorTransform {
345  public:
346   /// \brief Constructor.
347   /// \param[in] pattern The regex expression patterns.
348   /// \param[in] replace The string to replace the matched element.
349   /// \param[in] replace_all Confirm whether to replace all. If false, only replace the first matched element;
350   ///   if true, replace all matched elements (default=true).
351   RegexReplace(std::string pattern, std::string replace, bool replace_all = true)
RegexReplace(StringToChar (pattern),StringToChar (replace),replace_all)352       : RegexReplace(StringToChar(pattern), StringToChar(replace), replace_all) {}
353 
354   /// \brief Constructor.
355   /// \param[in] pattern The regex expression patterns. Type should be char of vector.
356   /// \param[in] replace The string to replace the matched element.
357   /// \param[in] replace_all Confirm whether to replace all. If false, only replace the first matched element;
358   ///   if true, replace all matched elements (default=true).
359   RegexReplace(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all);
360 
361   /// \brief Destructor
362   ~RegexReplace() = default;
363 
364  protected:
365   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
366   /// \return Shared pointer to the TensorOperation object.
367   std::shared_ptr<TensorOperation> Parse() override;
368 
369  private:
370   struct Data;
371   std::shared_ptr<Data> data_;
372 };
373 
374 /// \brief Tokenize a scalar tensor of UTF-8 string by the regex expression pattern.
375 class RegexTokenizer final : public TensorTransform {
376  public:
377   /// \brief Constructor.
378   /// \param[in] delim_pattern The pattern of regex delimiters.
379   /// \param[in] keep_delim_pattern The string matched with 'delim_pattern' can be kept as a token if it can be
380   ///   matched by 'keep_delim_pattern'. The default value is an empty string ("").
381   ///   which means that delimiters will not be kept as an output token (default="").
382   /// \param[in] with_offsets Whether to output offsets of tokens (default=false).
383   explicit RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern = "", bool with_offsets = false)
RegexTokenizer(StringToChar (delim_pattern),StringToChar (keep_delim_pattern),with_offsets)384       : RegexTokenizer(StringToChar(delim_pattern), StringToChar(keep_delim_pattern), with_offsets) {}
385 
386   explicit RegexTokenizer(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern,
387                           bool with_offsets);
388 
389   /// \brief Destructor
390   ~RegexTokenizer() = default;
391 
392  protected:
393   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
394   /// \return Shared pointer to the TensorOperation object.
395   std::shared_ptr<TensorOperation> Parse() override;
396 
397  private:
398   struct Data;
399   std::shared_ptr<Data> data_;
400 };
401 #endif
402 
403 /// \brief Tokenize a scalar token or a 1-D token to tokens by sentencepiece.
404 class SentencePieceTokenizer final : public TensorTransform {
405  public:
406   /// \brief Constructor.
407   /// \param[in] vocab a SentencePieceVocab object.
408   /// \param[in] out_type The type of the output.
409   SentencePieceTokenizer(const std::shared_ptr<SentencePieceVocab> &vocab,
410                          mindspore::dataset::SPieceTokenizerOutType out_type);
411 
412   /// \brief Constructor.
413   /// \param[in] vocab_path vocab model file path.
414   /// \param[in] out_type The type of the output.
SentencePieceTokenizer(const std::string & vocab_path,mindspore::dataset::SPieceTokenizerOutType out_type)415   SentencePieceTokenizer(const std::string &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type)
416       : SentencePieceTokenizer(StringToChar(vocab_path), out_type) {}
417 
418   /// \brief Constructor.
419   /// \param[in] vocab_path vocab model file path. type should be char of vector.
420   /// \param[in] out_type The type of the output.
421   SentencePieceTokenizer(const std::vector<char> &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type);
422 
423   /// \brief Destructor
424   ~SentencePieceTokenizer() = default;
425 
426  protected:
427   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
428   /// \return Shared pointer to the TensorOperation object.
429   std::shared_ptr<TensorOperation> Parse() override;
430 
431  private:
432   struct Data;
433   std::shared_ptr<Data> data_;
434 };
435 
436 /// \brief Construct a tensor from data (only 1-D for now), where each element in the dimension
437 ///   axis is a slice of data starting at the corresponding position, with a specified width.
438 class SlidingWindow final : public TensorTransform {
439  public:
440   /// \brief Constructor.
441   /// \param[in] width The width of the window. It must be an integer and greater than zero.
442   /// \param[in] axis The axis where the sliding window is computed (default=0), axis only
443   ///    supports 0 or -1 for now.
444   explicit SlidingWindow(const int32_t width, const int32_t axis = 0);
445 
446   /// \brief Destructor
447   ~SlidingWindow() = default;
448 
449  protected:
450   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
451   /// \return Shared pointer to the TensorOperation object.
452   std::shared_ptr<TensorOperation> Parse() override;
453 
454  private:
455   struct Data;
456   std::shared_ptr<Data> data_;
457 };
458 
459 /// \brief Convert every element in a string tensor to a number.
460 ///   Strings are cast according to the rules specified in the following links:
461 ///   https://en.cppreference.com/w/cpp/string/basic_string/stof,
462 ///   https://en.cppreference.com/w/cpp/string/basic_string/stoul,
463 ///   except that any strings which represent negative numbers cannot be cast to an unsigned integer type.
464 class ToNumber final : public TensorTransform {
465  public:
466   /// \brief Constructor.
467   /// \param[in] data_type mindspore::DataType of the tensor to be cast to. Must be a numeric type, excluding bool.
468   explicit ToNumber(mindspore::DataType data_type);
469 
470   /// \brief Destructor
471   ~ToNumber() = default;
472 
473  protected:
474   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
475   /// \return Shared pointer to the TensorOperation object.
476   std::shared_ptr<TensorOperation> Parse() override;
477 
478  private:
479   struct Data;
480   std::shared_ptr<Data> data_;
481 };
482 
483 /// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length.
484 class TruncateSequencePair final : public TensorTransform {
485  public:
486   /// \brief Constructor.
487   /// \param[in] max_length Maximum length required.
488   explicit TruncateSequencePair(int32_t max_length);
489 
490   /// \brief Destructor
491   ~TruncateSequencePair() = default;
492 
493  protected:
494   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
495   /// \return Shared pointer to the TensorOperation object.
496   std::shared_ptr<TensorOperation> Parse() override;
497 
498  private:
499   struct Data;
500   std::shared_ptr<Data> data_;
501 };
502 
503 /// \brief Tokenize a scalar tensor of UTF-8 string to Unicode characters.
504 class UnicodeCharTokenizer final : public TensorTransform {
505  public:
506   /// \brief Constructor.
507   /// \param[in] with_offsets whether to output offsets of tokens (default=false).
508   explicit UnicodeCharTokenizer(bool with_offsets = false);
509 
510   /// \brief Destructor
511   ~UnicodeCharTokenizer() = default;
512 
513  protected:
514   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
515   /// \return Shared pointer to the TensorOperation object.
516   std::shared_ptr<TensorOperation> Parse() override;
517 
518  private:
519   struct Data;
520   std::shared_ptr<Data> data_;
521 };
522 
523 /// \brief Tokenize scalar token or 1-D tokens to 1-D sub-word tokens.
524 class WordpieceTokenizer final : public TensorTransform {
525  public:
526   /// \brief Constructor.
527   /// \param[in] vocab A Vocab object.
528   /// \param[in] suffix_indicator This parameter is used to show that the sub-word
529   ///    is the last part of a word (default='##').
530   /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100).
531   /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty
532   ///    string, else return the specified string (default='[UNK]').
533   /// \param[in] with_offsets whether to output offsets of tokens (default=false).
534   explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##",
535                               int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]",
536                               bool with_offsets = false)
WordpieceTokenizer(vocab,StringToChar (suffix_indicator),max_bytes_per_token,StringToChar (unknown_token),with_offsets)537       : WordpieceTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token),
538                            with_offsets) {}
539 
540   explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator,
541                               int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool with_offsets);
542 
543   /// \brief Destructor
544   ~WordpieceTokenizer() = default;
545 
546  protected:
547   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
548   /// \return Shared pointer to the TensorOperation object.
549   std::shared_ptr<TensorOperation> Parse() override;
550 
551  private:
552   struct Data;
553   std::shared_ptr<Data> data_;
554 };
555 
556 #ifndef _WIN32
557 /// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries.
558 class UnicodeScriptTokenizer final : public TensorTransform {
559  public:
560   /// \brief Constructor.
561   /// \param[in] keep_whitespace whether to emit whitespace tokens (default=false).
562   /// \param[in] with_offsets whether to output offsets of tokens (default=false).
563   explicit UnicodeScriptTokenizer(bool keep_whitespace = false, bool with_offsets = false);
564 
565   /// \brief Destructor
566   ~UnicodeScriptTokenizer() = default;
567 
568  protected:
569   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
570   /// \return Shared pointer to the TensorOperation object.
571   std::shared_ptr<TensorOperation> Parse() override;
572 
573  private:
574   struct Data;
575   std::shared_ptr<Data> data_;
576 };
577 
578 /// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces.
579 class WhitespaceTokenizer final : public TensorTransform {
580  public:
581   /// \brief Constructor.
582   /// \param[in] with_offsets whether to output offsets of tokens (default=false).
583   explicit WhitespaceTokenizer(bool with_offsets = false);
584 
585   /// \brief Destructor
586   ~WhitespaceTokenizer() = default;
587 
588  protected:
589   /// \brief The function to convert a TensorTransform object into a TensorOperation object.
590   /// \return Shared pointer to the TensorOperation object.
591   std::shared_ptr<TensorOperation> Parse() override;
592 
593  private:
594   struct Data;
595   std::shared_ptr<Data> data_;
596 };
597 #endif
598 }  // namespace text
599 }  // namespace dataset
600 }  // namespace mindspore
601 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_TEXT_H_
602