1 /** 2 * Copyright 2020-2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_TEXT_H_ 18 #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_TEXT_H_ 19 20 #include <memory> 21 #include <optional> 22 #include <string> 23 #include <utility> 24 #include <vector> 25 26 #include "include/api/dual_abi_helper.h" 27 #include "include/api/status.h" 28 #include "include/dataset/constants.h" 29 #include "include/dataset/transforms.h" 30 31 namespace mindspore { 32 namespace dataset { 33 34 class Vocab; 35 class SentencePieceVocab; 36 class TensorOperation; 37 38 // Transform operations for text 39 namespace text { 40 41 #ifndef _WIN32 42 /// \brief Tokenize a scalar tensor of UTF-8 string by specific rules. 43 /// \note BasicTokenizer is not supported on the Windows platform yet. 44 class BasicTokenizer final : public TensorTransform { 45 public: 46 /// \brief Constructor. 47 /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations to 48 /// the input text to fold the text to lower case and strip accents characters. If false, only apply 49 /// the NormalizeUTF8('normalization_form' mode) operation to the input text (default=false). 50 /// \param[in] keep_whitespace If true, the whitespace will be kept in output tokens (default=false). 51 /// \param[in] normalize_form This parameter is used to specify a specific normalize mode. This is only effective 52 /// when 'lower_case' is false. See NormalizeUTF8 for details (default=NormalizeForm::kNone). 53 /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and 54 /// '[MASK]' (default=true). 55 /// \param[in] with_offsets Whether to output offsets of tokens (default=false). 56 explicit BasicTokenizer(bool lower_case = false, bool keep_whitespace = false, 57 const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true, 58 bool with_offsets = false); 59 60 /// \brief Destructor 61 ~BasicTokenizer() = default; 62 63 protected: 64 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 65 /// \return Shared pointer to the TensorOperation object. 66 std::shared_ptr<TensorOperation> Parse() override; 67 68 private: 69 struct Data; 70 std::shared_ptr<Data> data_; 71 }; 72 73 /// \brief A tokenizer used for Bert text process. 74 /// \note BertTokenizer is not supported on the Windows platform yet. 75 class BertTokenizer final : public TensorTransform { 76 public: 77 /// \brief Constructor. 78 /// \param[in] vocab A Vocab object. 79 /// \param[in] suffix_indicator This parameter is used to show that the sub-word 80 /// is the last part of a word (default='##'). 81 /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100). 82 /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty 83 /// string, else return the specified string (default='[UNK]'). 84 /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations to 85 /// the input text to fold the text to lower case and strip accents characters. If false, only apply 86 /// the NormalizeUTF8('normalization_form' mode) operation to the input text (default=false). 87 /// \param[in] keep_whitespace If true, the whitespace will be kept in output tokens (default=false). 88 /// \param[in] normalize_form This parameter is used to specify a specific normalize mode. This is only effective 89 /// when 'lower_case' is false. See NormalizeUTF8 for details (default=NormalizeForm::kNone). 90 /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and 91 /// '[MASK]' (default=true). 92 /// \param[in] with_offsets Whether to output offsets of tokens (default=false). 93 explicit BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##", 94 int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]", 95 bool lower_case = false, bool keep_whitespace = false, 96 const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true, 97 bool with_offsets = false) BertTokenizer(vocab,StringToChar (suffix_indicator),max_bytes_per_token,StringToChar (unknown_token),lower_case,keep_whitespace,normalize_form,preserve_unused_token,with_offsets)98 : BertTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token), 99 lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets) {} 100 /// \brief Constructor. 101 /// \param[in] vocab A Vocab object. 102 /// \param[in] suffix_indicator This parameter is used to show that the sub-word 103 /// is the last part of a word (default='##'). 104 /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100). 105 /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty 106 /// string, else return the specified string (default='[UNK]'). 107 /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations to 108 /// the input text to fold the text to lower case and strip accents characters. If false, only apply 109 /// the NormalizeUTF8('normalization_form' mode) operation to the input text (default=false). 110 /// \param[in] keep_whitespace If true, the whitespace will be kept in output tokens (default=false). 111 /// \param[in] normalize_form This parameter is used to specify a specific normalize mode. This is only effective 112 /// when 'lower_case' is false. See NormalizeUTF8 for details (default=NormalizeForm::kNone). 113 /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and 114 /// '[MASK]' (default=true). 115 /// \param[in] with_offsets Whether to output offsets of tokens (default=false). 116 BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, 117 int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool lower_case, 118 bool keep_whitespace, const NormalizeForm normalize_form, bool preserve_unused_token, 119 bool with_offsets); 120 121 /// \brief Destructor 122 ~BertTokenizer() = default; 123 124 protected: 125 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 126 /// \return Shared pointer to the TensorOperation object. 127 std::shared_ptr<TensorOperation> Parse() override; 128 129 private: 130 struct Data; 131 std::shared_ptr<Data> data_; 132 }; 133 134 /// \brief Apply case fold operation on UTF-8 string tensors. 135 class CaseFold final : public TensorTransform { 136 public: 137 /// \brief Constructor. 138 CaseFold(); 139 140 /// \brief Destructor 141 ~CaseFold() = default; 142 143 protected: 144 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 145 /// \return Shared pointer to the TensorOperation object. 146 std::shared_ptr<TensorOperation> Parse() override; 147 }; 148 #endif 149 150 /// \brief Tokenize a Chinese string into words based on the dictionary. 151 /// \note The integrity of the HMMSegment algorithm and MPSegment algorithm files must be confirmed. 152 class JiebaTokenizer final : public TensorTransform { 153 public: 154 /// \brief Constructor. 155 /// \param[in] hmm_path Dictionary file is used by the HMMSegment algorithm. The dictionary can be obtained on the 156 /// official website of cppjieba (https://github.com/yanyiwu/cppjieba). 157 /// \param[in] mp_path Dictionary file is used by the MPSegment algorithm. The dictionary can be obtained on the 158 /// official website of cppjieba (https://github.com/yanyiwu/cppjieba). 159 /// \param[in] mode Valid values can be any of JiebaMode.kMP, JiebaMode.kHMM and JiebaMode.kMIX 160 /// (default=JiebaMode.kMIX). 161 /// - JiebaMode.kMP, tokenizes with MPSegment algorithm. 162 /// - JiebaMode.kHMM, tokenizes with Hidden Markov Model Segment algorithm. 163 /// - JiebaMode.kMIX, tokenizes with a mix of MPSegment and HMMSegment algorithms. 164 /// \param[in] with_offsets Whether to output offsets of tokens (default=false). 165 JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode = JiebaMode::kMix, 166 bool with_offsets = false) JiebaTokenizer(StringToChar (hmm_path),StringToChar (mp_path),mode,with_offsets)167 : JiebaTokenizer(StringToChar(hmm_path), StringToChar(mp_path), mode, with_offsets) {} 168 169 /// \brief Constructor. 170 /// \param[in] hmm_path Dictionary file is used by the HMMSegment algorithm. The dictionary can be obtained on the 171 /// official website of cppjieba (https://github.com/yanyiwu/cppjieba). 172 /// \param[in] mp_path Dictionary file is used by the MPSegment algorithm. The dictionary can be obtained on the 173 /// official website of cppjieba (https://github.com/yanyiwu/cppjieba). 174 /// \param[in] mode Valid values can be any of JiebaMode.kMP, JiebaMode.kHMM and JiebaMode.kMIX 175 /// (default=JiebaMode.kMIX). 176 /// - JiebaMode.kMP, tokenizes with MPSegment algorithm. 177 /// - JiebaMode.kHMM, tokenizes with Hidden Markov Model Segment algorithm. 178 /// - JiebaMode.kMIX, tokenizes with a mix of MPSegment and HMMSegment algorithms. 179 /// \param[in] with_offsets Whether to output offsets of tokens (default=false). 180 JiebaTokenizer(const std::vector<char> &hmm_path, const std::vector<char> &mp_path, const JiebaMode &mode, 181 bool with_offsets); 182 183 /// \brief Destructor 184 ~JiebaTokenizer() = default; 185 186 /// \brief Add a user defined word to the JiebaTokenizer's dictionary. 187 /// \param[in] word The word to be added to the JiebaTokenizer instance. 188 /// The added word will not be written into the built-in dictionary on disk. 189 /// \param[in] freq The frequency of the word to be added. The higher the frequency, 190 /// the better chance the word will be tokenized (default=None, use default frequency). 191 /// \return Status error code, returns OK if no error is encountered. 192 Status AddWord(const std::string &word, int64_t freq = 0) { return AddWordChar(StringToChar(word), freq); } 193 194 /// \brief Add a user defined dictionary of word-freq pairs to the JiebaTokenizer's dictionary. 195 /// \param[in] user_dict Vector of word-freq pairs to be added to the JiebaTokenizer's dictionary. 196 /// \return Status error code, returns OK if no error is encountered. AddDict(const std::vector<std::pair<std::string,int64_t>> & user_dict)197 Status AddDict(const std::vector<std::pair<std::string, int64_t>> &user_dict) { 198 return AddDictChar(PairStringInt64ToPairCharInt64(user_dict)); 199 } 200 201 /// \brief Add user defined dictionary of word-freq pairs to the JiebaTokenizer's dictionary from a file. 202 /// Only valid word-freq pairs in user defined file will be added into the dictionary. 203 /// Rows containing invalid inputs will be ignored, no error nor warning status is returned. 204 /// \param[in] file_path Path to the dictionary which includes user defined word-freq pairs. 205 /// \return Status error code, returns OK if no error is encountered. AddDict(const std::string & file_path)206 Status AddDict(const std::string &file_path) { return AddDictChar(StringToChar(file_path)); } 207 208 protected: 209 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 210 /// \return Shared pointer to the TensorOperation object. 211 std::shared_ptr<TensorOperation> Parse() override; 212 213 private: 214 /// \brief Parser user defined words by files. 215 /// \param[in] file_path Path to the user defined file. 216 /// \param[in] user_dict Vector of word-freq pairs extracted from the user defined file. 217 Status ParserFile(const std::string &file_path, std::vector<std::pair<std::string, int64_t>> *const user_dict); 218 219 /// \brief Used to translate all API strings to vector of char and reverse. 220 Status AddWordChar(const std::vector<char> &word, int64_t freq = 0); 221 222 /// \brief Used to translate all API strings to vector of char and reverse. 223 Status AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict); 224 225 /// \brief Used to translate all API strings to vector of char and reverse. 226 Status AddDictChar(const std::vector<char> &file_path); 227 228 struct Data; 229 std::shared_ptr<Data> data_; 230 }; 231 232 /// \brief Look up a word into an id according to the input vocabulary table. 233 class Lookup final : public TensorTransform { 234 public: 235 /// \brief Constructor. 236 /// \param[in] vocab a Vocab object. 237 /// \param[in] unknown_token Word is used for lookup. In case of the word is out of vocabulary (OOV), 238 /// the result of lookup will be replaced to unknown_token. If the unknown_token is not specified or it is OOV, 239 /// runtime error will be thrown (default={}, means no unknown_token is specified). 240 /// \param[in] data_type mindspore::DataType of the tensor after lookup; must be numeric, including bool. 241 /// (default=mindspore::DataType::kNumberTypeInt32). 242 explicit Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token = {}, 243 mindspore::DataType data_type = mindspore::DataType::kNumberTypeInt32) { 244 std::optional<std::vector<char>> unknown_token_c = std::nullopt; 245 if (unknown_token != std::nullopt) { 246 unknown_token_c = std::vector<char>(unknown_token->begin(), unknown_token->end()); 247 } 248 new (this) Lookup(vocab, unknown_token_c, data_type); 249 } 250 251 /// \brief Constructor. 252 /// \param[in] vocab a Vocab object. 253 /// \param[in] unknown_token Word is used for lookup. In case of the word is out of vocabulary (OOV), 254 /// the result of lookup will be replaced to unknown_token. If the unknown_token is not specified or it is OOV, 255 /// runtime error will be thrown (default={}, means no unknown_token is specified). 256 /// \param[in] data_type mindspore::DataType of the tensor after lookup; must be numeric, including bool. 257 /// (default=mindspore::DataType::kNumberTypeInt32). 258 Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token, 259 mindspore::DataType data_type = mindspore::DataType::kNumberTypeInt32); 260 261 /// \brief Destructor 262 ~Lookup() = default; 263 264 protected: 265 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 266 /// \return Shared pointer to the TensorOperation object. 267 std::shared_ptr<TensorOperation> Parse() override; 268 269 private: 270 struct Data; 271 std::shared_ptr<Data> data_; 272 }; 273 274 /// \brief Generate n-gram from a 1-D string Tensor. 275 class Ngram final : public TensorTransform { 276 public: 277 /// \brief Constructor. 278 /// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result 279 /// would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up 280 /// a n-gram, an empty string will be returned. 281 /// \param[in] left_pad {"pad_token", pad_width}. Padding performed on left side of the sequence. pad_width will 282 /// be capped at n-1. left_pad=("_",2) would pad the left side of the sequence with "__" (default={"", 0}}). 283 /// \param[in] right_pad {"pad_token", pad_width}. Padding performed on right side of the sequence.pad_width will 284 /// be capped at n-1. right_pad=("-",2) would pad the right side of the sequence with "--" (default={"", 0}}). 285 /// \param[in] separator Symbol used to join strings together (default=" "). 286 explicit Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad = {"", 0}, 287 const std::pair<std::string, int32_t> &right_pad = {"", 0}, const std::string &separator = " ") Ngram(ngrams,PairStringToChar (left_pad),PairStringToChar (right_pad),StringToChar (separator))288 : Ngram(ngrams, PairStringToChar(left_pad), PairStringToChar(right_pad), StringToChar(separator)) {} 289 290 /// \brief Constructor. 291 /// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result 292 /// would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up 293 /// a n-gram, an empty string will be returned. 294 /// \param[in] left_pad {"pad_token", pad_width}. Padding performed on left side of the sequence. pad_width will 295 /// be capped at n-1. left_pad=("_",2) would pad the left side of the sequence with "__" (default={"", 0}}). 296 /// \param[in] right_pad {"pad_token", pad_width}. Padding performed on right side of the sequence.pad_width will 297 /// be capped at n-1. right_pad=("-",2) would pad the right side of the sequence with "--" (default={"", 0}}). 298 /// \param[in] separator Symbol used to join strings together (default=" "). 299 Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad, 300 const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator); 301 302 /// \brief Destructor 303 ~Ngram() = default; 304 305 protected: 306 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 307 /// \return Shared pointer to the TensorOperation object. 308 std::shared_ptr<TensorOperation> Parse() override; 309 310 private: 311 struct Data; 312 std::shared_ptr<Data> data_; 313 }; 314 315 #ifndef _WIN32 316 /// \brief Apply normalize operation to UTF-8 string tensors. 317 class NormalizeUTF8 final : public TensorTransform { 318 public: 319 /// \brief Constructor. 320 /// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone,NormalizeForm::kNfc, 321 /// NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd](default=NormalizeForm::kNfkc). 322 /// See http://unicode.org/reports/tr15/ for details. 323 /// - NormalizeForm.kNone, remain the input string tensor unchanged. 324 /// - NormalizeForm.kNfc, normalizes with Normalization Form C. 325 /// - NormalizeForm.kNfkc, normalizes with Normalization Form KC. 326 /// - NormalizeForm.kNfd, normalizes with Normalization Form D. 327 /// - NormalizeForm.kNfkd, normalizes with Normalization Form KD. 328 explicit NormalizeUTF8(NormalizeForm normalize_form = NormalizeForm::kNfkc); 329 330 /// \brief Destructor 331 ~NormalizeUTF8() = default; 332 333 protected: 334 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 335 /// \return Shared pointer to the TensorOperation object. 336 std::shared_ptr<TensorOperation> Parse() override; 337 338 private: 339 struct Data; 340 std::shared_ptr<Data> data_; 341 }; 342 343 /// \brief Replace a UTF-8 string tensor with 'replace' according to regular expression 'pattern'. 344 class RegexReplace final : public TensorTransform { 345 public: 346 /// \brief Constructor. 347 /// \param[in] pattern The regex expression patterns. 348 /// \param[in] replace The string to replace the matched element. 349 /// \param[in] replace_all Confirm whether to replace all. If false, only replace the first matched element; 350 /// if true, replace all matched elements (default=true). 351 RegexReplace(std::string pattern, std::string replace, bool replace_all = true) RegexReplace(StringToChar (pattern),StringToChar (replace),replace_all)352 : RegexReplace(StringToChar(pattern), StringToChar(replace), replace_all) {} 353 354 /// \brief Constructor. 355 /// \param[in] pattern The regex expression patterns. Type should be char of vector. 356 /// \param[in] replace The string to replace the matched element. 357 /// \param[in] replace_all Confirm whether to replace all. If false, only replace the first matched element; 358 /// if true, replace all matched elements (default=true). 359 RegexReplace(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all); 360 361 /// \brief Destructor 362 ~RegexReplace() = default; 363 364 protected: 365 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 366 /// \return Shared pointer to the TensorOperation object. 367 std::shared_ptr<TensorOperation> Parse() override; 368 369 private: 370 struct Data; 371 std::shared_ptr<Data> data_; 372 }; 373 374 /// \brief Tokenize a scalar tensor of UTF-8 string by the regex expression pattern. 375 class RegexTokenizer final : public TensorTransform { 376 public: 377 /// \brief Constructor. 378 /// \param[in] delim_pattern The pattern of regex delimiters. 379 /// \param[in] keep_delim_pattern The string matched with 'delim_pattern' can be kept as a token if it can be 380 /// matched by 'keep_delim_pattern'. The default value is an empty string (""). 381 /// which means that delimiters will not be kept as an output token (default=""). 382 /// \param[in] with_offsets Whether to output offsets of tokens (default=false). 383 explicit RegexTokenizer(std::string delim_pattern, std::string keep_delim_pattern = "", bool with_offsets = false) RegexTokenizer(StringToChar (delim_pattern),StringToChar (keep_delim_pattern),with_offsets)384 : RegexTokenizer(StringToChar(delim_pattern), StringToChar(keep_delim_pattern), with_offsets) {} 385 386 explicit RegexTokenizer(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern, 387 bool with_offsets); 388 389 /// \brief Destructor 390 ~RegexTokenizer() = default; 391 392 protected: 393 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 394 /// \return Shared pointer to the TensorOperation object. 395 std::shared_ptr<TensorOperation> Parse() override; 396 397 private: 398 struct Data; 399 std::shared_ptr<Data> data_; 400 }; 401 #endif 402 403 /// \brief Tokenize a scalar token or a 1-D token to tokens by sentencepiece. 404 class SentencePieceTokenizer final : public TensorTransform { 405 public: 406 /// \brief Constructor. 407 /// \param[in] vocab a SentencePieceVocab object. 408 /// \param[in] out_type The type of the output. 409 SentencePieceTokenizer(const std::shared_ptr<SentencePieceVocab> &vocab, 410 mindspore::dataset::SPieceTokenizerOutType out_type); 411 412 /// \brief Constructor. 413 /// \param[in] vocab_path vocab model file path. 414 /// \param[in] out_type The type of the output. SentencePieceTokenizer(const std::string & vocab_path,mindspore::dataset::SPieceTokenizerOutType out_type)415 SentencePieceTokenizer(const std::string &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type) 416 : SentencePieceTokenizer(StringToChar(vocab_path), out_type) {} 417 418 /// \brief Constructor. 419 /// \param[in] vocab_path vocab model file path. type should be char of vector. 420 /// \param[in] out_type The type of the output. 421 SentencePieceTokenizer(const std::vector<char> &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type); 422 423 /// \brief Destructor 424 ~SentencePieceTokenizer() = default; 425 426 protected: 427 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 428 /// \return Shared pointer to the TensorOperation object. 429 std::shared_ptr<TensorOperation> Parse() override; 430 431 private: 432 struct Data; 433 std::shared_ptr<Data> data_; 434 }; 435 436 /// \brief Construct a tensor from data (only 1-D for now), where each element in the dimension 437 /// axis is a slice of data starting at the corresponding position, with a specified width. 438 class SlidingWindow final : public TensorTransform { 439 public: 440 /// \brief Constructor. 441 /// \param[in] width The width of the window. It must be an integer and greater than zero. 442 /// \param[in] axis The axis where the sliding window is computed (default=0), axis only 443 /// supports 0 or -1 for now. 444 explicit SlidingWindow(const int32_t width, const int32_t axis = 0); 445 446 /// \brief Destructor 447 ~SlidingWindow() = default; 448 449 protected: 450 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 451 /// \return Shared pointer to the TensorOperation object. 452 std::shared_ptr<TensorOperation> Parse() override; 453 454 private: 455 struct Data; 456 std::shared_ptr<Data> data_; 457 }; 458 459 /// \brief Convert every element in a string tensor to a number. 460 /// Strings are cast according to the rules specified in the following links: 461 /// https://en.cppreference.com/w/cpp/string/basic_string/stof, 462 /// https://en.cppreference.com/w/cpp/string/basic_string/stoul, 463 /// except that any strings which represent negative numbers cannot be cast to an unsigned integer type. 464 class ToNumber final : public TensorTransform { 465 public: 466 /// \brief Constructor. 467 /// \param[in] data_type mindspore::DataType of the tensor to be cast to. Must be a numeric type, excluding bool. 468 explicit ToNumber(mindspore::DataType data_type); 469 470 /// \brief Destructor 471 ~ToNumber() = default; 472 473 protected: 474 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 475 /// \return Shared pointer to the TensorOperation object. 476 std::shared_ptr<TensorOperation> Parse() override; 477 478 private: 479 struct Data; 480 std::shared_ptr<Data> data_; 481 }; 482 483 /// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length. 484 class TruncateSequencePair final : public TensorTransform { 485 public: 486 /// \brief Constructor. 487 /// \param[in] max_length Maximum length required. 488 explicit TruncateSequencePair(int32_t max_length); 489 490 /// \brief Destructor 491 ~TruncateSequencePair() = default; 492 493 protected: 494 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 495 /// \return Shared pointer to the TensorOperation object. 496 std::shared_ptr<TensorOperation> Parse() override; 497 498 private: 499 struct Data; 500 std::shared_ptr<Data> data_; 501 }; 502 503 /// \brief Tokenize a scalar tensor of UTF-8 string to Unicode characters. 504 class UnicodeCharTokenizer final : public TensorTransform { 505 public: 506 /// \brief Constructor. 507 /// \param[in] with_offsets whether to output offsets of tokens (default=false). 508 explicit UnicodeCharTokenizer(bool with_offsets = false); 509 510 /// \brief Destructor 511 ~UnicodeCharTokenizer() = default; 512 513 protected: 514 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 515 /// \return Shared pointer to the TensorOperation object. 516 std::shared_ptr<TensorOperation> Parse() override; 517 518 private: 519 struct Data; 520 std::shared_ptr<Data> data_; 521 }; 522 523 /// \brief Tokenize scalar token or 1-D tokens to 1-D sub-word tokens. 524 class WordpieceTokenizer final : public TensorTransform { 525 public: 526 /// \brief Constructor. 527 /// \param[in] vocab A Vocab object. 528 /// \param[in] suffix_indicator This parameter is used to show that the sub-word 529 /// is the last part of a word (default='##'). 530 /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100). 531 /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty 532 /// string, else return the specified string (default='[UNK]'). 533 /// \param[in] with_offsets whether to output offsets of tokens (default=false). 534 explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##", 535 int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]", 536 bool with_offsets = false) WordpieceTokenizer(vocab,StringToChar (suffix_indicator),max_bytes_per_token,StringToChar (unknown_token),with_offsets)537 : WordpieceTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token), 538 with_offsets) {} 539 540 explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, 541 int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool with_offsets); 542 543 /// \brief Destructor 544 ~WordpieceTokenizer() = default; 545 546 protected: 547 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 548 /// \return Shared pointer to the TensorOperation object. 549 std::shared_ptr<TensorOperation> Parse() override; 550 551 private: 552 struct Data; 553 std::shared_ptr<Data> data_; 554 }; 555 556 #ifndef _WIN32 557 /// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries. 558 class UnicodeScriptTokenizer final : public TensorTransform { 559 public: 560 /// \brief Constructor. 561 /// \param[in] keep_whitespace whether to emit whitespace tokens (default=false). 562 /// \param[in] with_offsets whether to output offsets of tokens (default=false). 563 explicit UnicodeScriptTokenizer(bool keep_whitespace = false, bool with_offsets = false); 564 565 /// \brief Destructor 566 ~UnicodeScriptTokenizer() = default; 567 568 protected: 569 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 570 /// \return Shared pointer to the TensorOperation object. 571 std::shared_ptr<TensorOperation> Parse() override; 572 573 private: 574 struct Data; 575 std::shared_ptr<Data> data_; 576 }; 577 578 /// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces. 579 class WhitespaceTokenizer final : public TensorTransform { 580 public: 581 /// \brief Constructor. 582 /// \param[in] with_offsets whether to output offsets of tokens (default=false). 583 explicit WhitespaceTokenizer(bool with_offsets = false); 584 585 /// \brief Destructor 586 ~WhitespaceTokenizer() = default; 587 588 protected: 589 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 590 /// \return Shared pointer to the TensorOperation object. 591 std::shared_ptr<TensorOperation> Parse() override; 592 593 private: 594 struct Data; 595 std::shared_ptr<Data> data_; 596 }; 597 #endif 598 } // namespace text 599 } // namespace dataset 600 } // namespace mindspore 601 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_TEXT_H_ 602