1 /** 2 * Copyright 2020-2023 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_TEXT_H_ 18 #define MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_TEXT_H_ 19 20 #include <memory> 21 #include <optional> 22 #include <string> 23 #include <unordered_map> 24 #include <utility> 25 #include <vector> 26 27 #include "include/api/dual_abi_helper.h" 28 #include "include/api/status.h" 29 #include "include/dataset/constants.h" 30 #include "include/dataset/transforms.h" 31 32 namespace mindspore { 33 namespace dataset { 34 class TensorOperation; 35 class Vectors; 36 37 using WordIdType = int32_t; 38 using WordType = std::string; 39 40 /// \brief Vocab object that is used to save pairs of words and ids. 41 /// \note It contains a map that maps each word(str) to an id(int) or reverse. 42 class Vocab { 43 public: 44 /// \brief Build a vocab from an unordered_map. IDs should be no duplicate and continuous. 45 /// \param[in] words An unordered_map containing word id pair. 46 /// \param[out] vocab A vocab object. 47 /// \return Status code. 48 /// \par Example 49 /// \code 50 /// // Build a map 51 /// std::unordered_map<std::string, int32_t> dict; 52 /// dict["banana"] = 0; 53 /// dict["apple"] = 1; 54 /// dict["cat"] = 2; 55 /// dict["dog"] = 3; 56 /// // Build vocab from map 57 /// std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 58 /// Status s = Vocab::BuildFromUnorderedMap(dict, &vocab); 59 /// \endcode 60 static Status BuildFromUnorderedMap(const std::unordered_map<WordType, WordIdType> &words, 61 std::shared_ptr<Vocab> *vocab); 62 63 /// \brief Build a vocab from a c++ vector. id no duplicate and continuous. 64 /// \param[in] words A vector of string containing words. 65 /// \param[in] special_tokens A vector of string containing special tokens. 66 /// \param[in] prepend_special Whether the special_tokens will be prepended/appended to vocab. 67 /// \param[out] vocab A vocab object. 68 /// \return Status code. 69 /// \par Example 70 /// \code 71 /// // Build vocab from a vector of words, special tokens are prepended to vocab 72 /// std::vector<std::string> list = {"apple", "banana", "cat", "dog", "egg"}; 73 /// std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 74 /// Status s = Vocab::BuildFromVector(list, {"<unk>"}, true, &vocab); 75 /// \endcode 76 static Status BuildFromVector(const std::vector<WordType> &words, const std::vector<WordType> &special_tokens, 77 bool prepend_special, std::shared_ptr<Vocab> *vocab); 78 79 /// \brief Build a vocab from vocab file, IDs will be automatically assigned. 80 /// \param[in] path Path to vocab file, each line in file is assumed as a word (including space). 81 /// \param[in] delimiter Delimiter to break each line, characters after the delimiter will be deprecated. 82 /// \param[in] vocab_size Number of lines to be read from file. 83 /// \param[in] special_tokens A vector of string containing special tokens. 84 /// \param[in] prepend_special Whether the special_tokens will be prepended/appended to vocab. 85 /// \param[out] vocab A vocab object. 86 /// \return Status code. 87 /// \par Example 88 /// \code 89 /// // Build vocab from local file 90 /// std::string vocab_dir = datasets_root_path_ + "/testVocab/vocab_list.txt"; 91 /// std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 92 /// Status s = Vocab::BuildFromFile(vocab_dir, ",", -1, {"<pad>", "<unk>"}, true, &vocab); 93 /// \endcode 94 static Status BuildFromFile(const std::string &path, const std::string &delimiter, int32_t vocab_size, 95 const std::vector<WordType> &special_tokens, bool prepend_special, 96 std::shared_ptr<Vocab> *vocab); 97 98 /// Lookup the id of a word, if the word doesn't exist in vocab, return -1. 99 /// \param word Word to be looked up. 100 /// \return ID of the word in the vocab. 101 /// \par Example 102 /// \code 103 /// // lookup, convert token to id 104 /// auto single_index = vocab->TokensToIds("home"); 105 /// single_index = vocab->TokensToIds("hello"); 106 /// \endcode 107 WordIdType TokensToIds(const WordType &word) const; 108 109 /// Lookup the id of a word, if the word doesn't exist in vocab, return -1. 110 /// \param words Words to be looked up. 111 /// \return ID of the word in the vocab. 112 /// \par Example 113 /// \code 114 /// // lookup multiple tokens 115 /// auto multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "behind"}); 116 /// std::vector<int32_t> expected_multi_indexs = {0, 4}; 117 /// multi_indexs = vocab->TokensToIds(std::vector<std::string>{"<pad>", "apple"}); 118 /// expected_multi_indexs = {0, -1}; 119 /// \endcode 120 std::vector<WordIdType> TokensToIds(const std::vector<WordType> &words) const; 121 122 /// Lookup the word of an ID, if ID doesn't exist in vocab, return empty string. 123 /// \param id ID to be looked up. 124 /// \return Indicates the word corresponding to the ID. 125 /// \par Example 126 /// \code 127 /// // reverse lookup, convert id to token 128 /// auto single_word = vocab->IdsToTokens(2); 129 /// single_word = vocab->IdsToTokens(-1); 130 /// \endcode 131 WordType IdsToTokens(const WordIdType &id); 132 133 /// Lookup the word of an ID, if ID doesn't exist in vocab, return empty string. 134 /// \param ids ID to be looked up. 135 /// \return Indicates the word corresponding to the ID. 136 /// \par Example 137 /// \code 138 /// // reverse lookup multiple ids 139 /// auto multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 4}); 140 /// std::vector<std::string> expected_multi_words = {"<pad>", "behind"}; 141 /// multi_words = vocab->IdsToTokens(std::vector<int32_t>{0, 99}); 142 /// expected_multi_words = {"<pad>", ""}; 143 /// \endcode 144 std::vector<WordType> IdsToTokens(const std::vector<WordIdType> &ids); 145 146 /// Constructor, shouldn't be called directly, can't be private due to std::make_unique(). 147 /// \param map Sanitized word2id map. 148 explicit Vocab(std::unordered_map<WordType, WordIdType> map); 149 150 /// \brief Add one word to vocab, increment it's index automatically. 151 /// \param word Word to be added, word will skip if word already exists. 152 void AppendWord(const std::string &word); 153 154 /// \brief Return a read-only vocab in unordered_map type. 155 /// \return A unordered_map of word2id. GetVocab()156 const std::unordered_map<WordType, WordIdType> &GetVocab() const { return word2id_; } 157 158 /// \brief Constructor. 159 Vocab() = default; 160 161 /// \brief Destructor. 162 ~Vocab() = default; 163 164 static const WordIdType kNoTokenExists; 165 static const WordType kNoIdExists; 166 167 private: 168 std::unordered_map<WordType, WordIdType> word2id_; 169 std::unordered_map<WordIdType, WordType> id2word_; 170 }; 171 172 /// \brief SentencePiece object that is used to do words segmentation. 173 class SentencePieceVocab { 174 public: 175 /// \brief Build a SentencePiece object from a file. 176 /// \param[in] path_list Path to the file which contains the SentencePiece list. 177 /// \param[in] vocab_size Vocabulary size. 178 /// \param[in] character_coverage Amount of characters covered by the model, good defaults are: 0.9995 for 179 /// languages with rich character set like Japanese or Chinese and 1.0 for other languages with small 180 /// character set. 181 /// \param[in] model_type It can be any of [SentencePieceModel::kUnigram, SentencePieceModel::kBpe, 182 /// SentencePieceModel::kChar, SentencePieceModel::kWord], default is SentencePieceModel::kUnigram. The 183 /// input sentence must be pre-tokenized when using SentencePieceModel.WORD type. 184 /// - SentencePieceModel.kUnigram, Unigram Language Model means the next word in the sentence is assumed 185 /// to be independent of the previous words generated by the model. 186 /// - SentencePieceModel.kBpe, refers to byte pair encoding algorithm, which replaces the most frequent 187 /// pair of bytes in a sentence with a single, unused byte. 188 /// - SentencePieceModel.kChar, refers to char based sentencePiece Model type. 189 /// - SentencePieceModel.kWord, refers to word based sentencePiece Model type. 190 /// \param[in] params A dictionary with no incoming parameters(The parameters are derived from SentencePiece library). 191 /// \param[out] vocab A SentencePieceVocab object. 192 /// \return SentencePieceVocab, vocab built from the file. 193 /// \par Example 194 /// \code 195 /// std::string dataset_path; 196 /// dataset_path = datasets_root_path_ + "/test_sentencepiece/vocab.txt"; 197 /// std::vector<std::string> path_list; 198 /// path_list.emplace_back(dataset_path); 199 /// std::unordered_map<std::string, std::string> param_map; 200 /// std::shared_ptr<SentencePieceVocab> spm = std::make_unique<SentencePieceVocab>(); 201 /// Status rc = SentencePieceVocab::BuildFromFile(path_list, 5000, 0.9995, 202 /// SentencePieceModel::kUnigram, param_map, &spm); 203 /// \endcode 204 static Status BuildFromFile(const std::vector<std::string> &path_list, int32_t vocab_size, float character_coverage, 205 const SentencePieceModel &model_type, 206 const std::unordered_map<std::string, std::string> ¶ms, 207 std::shared_ptr<SentencePieceVocab> *vocab); 208 209 /// \brief Save the SentencePiece model into given file path. 210 /// \param[in] vocab A SentencePiece object to be saved. 211 /// \param[in] path Path to store the model. 212 /// \param[in] filename The save name of model file. 213 /// \par Example 214 /// \code 215 /// // Save vocab model to local 216 /// vocab->SaveModel(&vocab, datasets_root_path_ + "/test_sentencepiece", "m.model"); 217 /// \endcode 218 static Status SaveModel(const std::shared_ptr<SentencePieceVocab> *vocab, const std::string &path, 219 const std::string &filename); 220 221 /// \brief Constructor. 222 SentencePieceVocab(); 223 224 /// \brief Destructor. 225 ~SentencePieceVocab() = default; 226 227 const std::string &model_proto(); 228 229 void set_model_proto(const std::string &model_proto); 230 231 private: 232 std::string model_proto_; 233 }; 234 235 // Transform operations for text 236 namespace text { 237 /// \brief Add token to beginning or end of sequence. 238 class DATASET_API AddToken final : public TensorTransform { 239 public: 240 /// \brief Constructor. 241 /// \param[in] token The token to be added. 242 /// \param[in] begin Whether to insert token at start or end of sequence. Default: true. 243 /// \par Example 244 /// \code 245 /// /* Define operations */ 246 /// auto add_token_op = text::AddToken(token='TOKEN', begin=True); 247 /// 248 /// /* dataset is an instance of Dataset object */ 249 /// dataset = dataset->Map({add_token_op}, // operations 250 /// {"text"}); // input columns 251 /// \endcode 252 explicit AddToken(const std::string &token, bool begin = true); 253 254 /// \brief Destructor. 255 ~AddToken() override = default; 256 257 protected: 258 /// \brief Function to convert TensorTransform object into a TensorOperation object. 259 /// \return Shared pointer to TensorOperation object. 260 std::shared_ptr<TensorOperation> Parse() override; 261 262 private: 263 struct Data; 264 std::shared_ptr<Data> data_; 265 }; 266 267 #ifndef _WIN32 268 /// \brief Tokenize a scalar tensor of UTF-8 string by specific rules. 269 /// \note BasicTokenizer is not supported on the Windows platform yet. 270 class DATASET_API BasicTokenizer final : public TensorTransform { 271 public: 272 /// \brief Constructor. 273 /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations to 274 /// the input text to fold the text to lower case and strip accents characters. If false, only apply 275 /// the NormalizeUTF8('normalization_form' mode) operation to the input text (default=false). 276 /// \param[in] keep_whitespace If true, the whitespace will be kept in output tokens (default=false). 277 /// \param[in] normalize_form This parameter is used to specify a specific normalize mode. This is only effective 278 /// when 'lower_case' is false. See NormalizeUTF8 for details (default=NormalizeForm::kNone). 279 /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and 280 /// '[MASK]' (default=true). 281 /// \param[in] with_offsets Whether to output offsets of tokens (default=false). 282 /// \par Example 283 /// \code 284 /// /* Define operations */ 285 /// auto tokenizer_op = text::BasicTokenizer(); 286 /// 287 /// /* dataset is an instance of Dataset object */ 288 /// dataset = dataset->Map({tokenizer_op}, // operations 289 /// {"text"}); // input columns 290 /// \endcode 291 explicit BasicTokenizer(bool lower_case = false, bool keep_whitespace = false, 292 NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true, 293 bool with_offsets = false); 294 295 /// \brief Destructor 296 ~BasicTokenizer() override = default; 297 298 protected: 299 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 300 /// \return Shared pointer to the TensorOperation object. 301 std::shared_ptr<TensorOperation> Parse() override; 302 303 private: 304 struct Data; 305 std::shared_ptr<Data> data_; 306 }; 307 308 /// \brief A tokenizer used for Bert text process. 309 /// \note BertTokenizer is not supported on the Windows platform yet. 310 class DATASET_API BertTokenizer final : public TensorTransform { 311 public: 312 /// \brief Constructor. 313 /// \param[in] vocab A Vocab object. 314 /// \param[in] suffix_indicator This parameter is used to show that the sub-word 315 /// is the last part of a word (default='##'). 316 /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100). 317 /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty 318 /// string, else return the specified string (default='[UNK]'). 319 /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations to 320 /// the input text to fold the text to lower case and strip accents characters. If false, only apply 321 /// the NormalizeUTF8('normalization_form' mode) operation to the input text (default=false). 322 /// \param[in] keep_whitespace If true, the whitespace will be kept in output tokens (default=false). 323 /// \param[in] normalize_form This parameter is used to specify a specific normalize mode. This is only effective 324 /// when 'lower_case' is false. See NormalizeUTF8 for details (default=NormalizeForm::kNone). 325 /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and 326 /// '[MASK]' (default=true). 327 /// \param[in] with_offsets Whether to output offsets of tokens (default=false). 328 /// \par Example 329 /// \code 330 /// /* Define operations */ 331 /// std::vector<std::string> list = {"a", "b", "c", "d"}; 332 /// std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 333 /// Status s = Vocab::BuildFromVector(list, {}, true, &vocab); 334 /// auto tokenizer_op = text::BertTokenizer(vocab); 335 /// 336 /// /* dataset is an instance of Dataset object */ 337 /// dataset = dataset->Map({tokenizer_op}, // operations 338 /// {"text"}); // input columns 339 /// \endcode 340 explicit BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##", 341 int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]", 342 bool lower_case = false, bool keep_whitespace = false, 343 const NormalizeForm normalize_form = NormalizeForm::kNone, bool preserve_unused_token = true, 344 bool with_offsets = false) BertTokenizer(vocab,StringToChar (suffix_indicator),max_bytes_per_token,StringToChar (unknown_token),lower_case,keep_whitespace,normalize_form,preserve_unused_token,with_offsets)345 : BertTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token), 346 lower_case, keep_whitespace, normalize_form, preserve_unused_token, with_offsets) {} 347 /// \brief Constructor. 348 /// \param[in] vocab A Vocab object. 349 /// \param[in] suffix_indicator This parameter is used to show that the sub-word 350 /// is the last part of a word (default='##'). 351 /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100). 352 /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty 353 /// string, else return the specified string (default='[UNK]'). 354 /// \param[in] lower_case If true, apply CaseFold, NormalizeUTF8 (NFD mode) and RegexReplace operations to 355 /// the input text to fold the text to lower case and strip accents characters. If false, only apply 356 /// the NormalizeUTF8('normalization_form' mode) operation to the input text (default=false). 357 /// \param[in] keep_whitespace If true, the whitespace will be kept in output tokens (default=false). 358 /// \param[in] normalize_form This parameter is used to specify a specific normalize mode. This is only effective 359 /// when 'lower_case' is false. See NormalizeUTF8 for details (default=NormalizeForm::kNone). 360 /// \param[in] preserve_unused_token If true, do not split special tokens like '[CLS]', '[SEP]', '[UNK]', '[PAD]' and 361 /// '[MASK]' (default=true). 362 /// \param[in] with_offsets Whether to output offsets of tokens (default=false). 363 BertTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, 364 int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool lower_case, 365 bool keep_whitespace, NormalizeForm normalize_form, bool preserve_unused_token, bool with_offsets); 366 367 /// \brief Destructor 368 ~BertTokenizer() override = default; 369 370 protected: 371 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 372 /// \return Shared pointer to the TensorOperation object. 373 std::shared_ptr<TensorOperation> Parse() override; 374 375 private: 376 struct Data; 377 std::shared_ptr<Data> data_; 378 }; 379 380 /// \brief Apply case fold operation on UTF-8 string tensors. 381 class DATASET_API CaseFold final : public TensorTransform { 382 public: 383 /// \brief Constructor. 384 /// \par Example 385 /// \code 386 /// /* Define operations */ 387 /// auto casefold_op = text::CaseFold(); 388 /// 389 /// /* dataset is an instance of Dataset object */ 390 /// dataset = dataset->Map({casefold_op}, // operations 391 /// {"text"}); // input columns 392 /// \endcode 393 CaseFold(); 394 395 /// \brief Destructor 396 ~CaseFold() override = default; 397 398 protected: 399 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 400 /// \return Shared pointer to the TensorOperation object. 401 std::shared_ptr<TensorOperation> Parse() override; 402 }; 403 404 /// \brief Filter wikipedia xml lines. 405 class FilterWikipediaXML final : public TensorTransform { 406 public: 407 /// \brief Constructor. 408 FilterWikipediaXML(); 409 410 /// \brief Destructor 411 ~FilterWikipediaXML() override = default; 412 413 protected: 414 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 415 /// \return Shared pointer to the TensorOperation object. 416 std::shared_ptr<TensorOperation> Parse() override; 417 }; 418 #endif 419 420 /// \brief Tokenize a Chinese string into words based on the dictionary. 421 /// \note The integrity of the HMMSegment algorithm and MPSegment algorithm files must be confirmed. 422 class DATASET_API JiebaTokenizer final : public TensorTransform { 423 public: 424 /// \brief Constructor. 425 /// \param[in] hmm_path Dictionary file is used by the HMMSegment algorithm. The dictionary can be obtained on the 426 /// official website of cppjieba (https://github.com/yanyiwu/cppjieba). 427 /// \param[in] mp_path Dictionary file is used by the MPSegment algorithm. The dictionary can be obtained on the 428 /// official website of cppjieba (https://github.com/yanyiwu/cppjieba). 429 /// \param[in] mode Valid values can be any of JiebaMode.kMP, JiebaMode.kHMM and JiebaMode.kMIX 430 /// (default=JiebaMode.kMIX). 431 /// - JiebaMode.kMP, tokenizes with MPSegment algorithm. 432 /// - JiebaMode.kHMM, tokenizes with Hidden Markov Model Segment algorithm. 433 /// - JiebaMode.kMIX, tokenizes with a mix of MPSegment and HMMSegment algorithms. 434 /// \param[in] with_offsets Whether to output offsets of tokens (default=false). 435 /// \par Example 436 /// \code 437 /// /* Define operations */ 438 /// auto tokenizer_op = text::JiebaTokenizer("/path/to/hmm/file", "/path/to/mp/file"); 439 /// 440 /// /* dataset is an instance of Dataset object */ 441 /// dataset = dataset->Map({tokenizer_op}, // operations 442 /// {"text"}); // input columns 443 /// \endcode 444 JiebaTokenizer(const std::string &hmm_path, const std::string &mp_path, const JiebaMode &mode = JiebaMode::kMix, 445 bool with_offsets = false) JiebaTokenizer(StringToChar (hmm_path),StringToChar (mp_path),mode,with_offsets)446 : JiebaTokenizer(StringToChar(hmm_path), StringToChar(mp_path), mode, with_offsets) {} 447 448 /// \brief Constructor. 449 /// \param[in] hmm_path Dictionary file is used by the HMMSegment algorithm. The dictionary can be obtained on the 450 /// official website of cppjieba (https://github.com/yanyiwu/cppjieba). 451 /// \param[in] mp_path Dictionary file is used by the MPSegment algorithm. The dictionary can be obtained on the 452 /// official website of cppjieba (https://github.com/yanyiwu/cppjieba). 453 /// \param[in] mode Valid values can be any of JiebaMode.kMP, JiebaMode.kHMM and JiebaMode.kMIX 454 /// (default=JiebaMode.kMIX). 455 /// - JiebaMode.kMP, tokenizes with MPSegment algorithm. 456 /// - JiebaMode.kHMM, tokenizes with Hidden Markov Model Segment algorithm. 457 /// - JiebaMode.kMIX, tokenizes with a mix of MPSegment and HMMSegment algorithms. 458 /// \param[in] with_offsets Whether to output offsets of tokens (default=false). 459 JiebaTokenizer(const std::vector<char> &hmm_path, const std::vector<char> &mp_path, const JiebaMode &mode, 460 bool with_offsets); 461 462 /// \brief Destructor 463 ~JiebaTokenizer() override = default; 464 465 /// \brief Add a user defined word to the JiebaTokenizer's dictionary. 466 /// \param[in] word The word to be added to the JiebaTokenizer instance. 467 /// The added word will not be written into the built-in dictionary on disk. 468 /// \param[in] freq The frequency of the word to be added. The higher the frequency, 469 /// the better chance the word will be tokenized (default=None, use default frequency). 470 /// \return Status error code, returns OK if no error is encountered. 471 /// \par Example 472 /// \code 473 /// /* Define operations */ 474 /// auto tokenizer_op = text::JiebaTokenizer("/path/to/hmm/file", "/path/to/mp/file"); 475 /// 476 /// Status s = tokenizer_op.AddWord("hello", 2); 477 /// \endcode 478 Status AddWord(const std::string &word, int64_t freq = 0) { return AddWordChar(StringToChar(word), freq); } 479 480 /// \brief Add a user defined dictionary of word-freq pairs to the JiebaTokenizer's dictionary. 481 /// \param[in] user_dict Vector of word-freq pairs to be added to the JiebaTokenizer's dictionary. 482 /// \return Status error code, returns OK if no error is encountered. 483 /// \par Example 484 /// \code 485 /// /* Define operations */ 486 /// auto tokenizer_op = text::JiebaTokenizer("/path/to/hmm/file", "/path/to/mp/file"); 487 /// 488 /// std::vector<std::pair<std::string, int64_t>> user_dict = {{"a", 1}, {"b", 2}, {"c", 3}}; 489 /// Status s = tokenizer_op.AddDict(user_dict); 490 /// \endcode AddDict(const std::vector<std::pair<std::string,int64_t>> & user_dict)491 Status AddDict(const std::vector<std::pair<std::string, int64_t>> &user_dict) { 492 return AddDictChar(PairStringInt64ToPairCharInt64(user_dict)); 493 } 494 495 /// \brief Add user defined dictionary of word-freq pairs to the JiebaTokenizer's dictionary from a file. 496 /// Only valid word-freq pairs in user defined file will be added into the dictionary. 497 /// Rows containing invalid inputs will be ignored, no error nor warning status is returned. 498 /// \param[in] file_path Path to the dictionary which includes user defined word-freq pairs. 499 /// \return Status error code, returns OK if no error is encountered. 500 /// \par Example 501 /// \code 502 /// /* Define operations */ 503 /// auto tokenizer_op = text::JiebaTokenizer("/path/to/hmm/file", "/path/to/mp/file"); 504 /// 505 /// Status s = tokenizer_op.AddDict("/path/to/dict/file"); 506 /// \endcode AddDict(const std::string & file_path)507 Status AddDict(const std::string &file_path) { return AddDictChar(StringToChar(file_path)); } 508 509 protected: 510 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 511 /// \return Shared pointer to the TensorOperation object. 512 std::shared_ptr<TensorOperation> Parse() override; 513 514 private: 515 /// \brief Parser user defined words by files. 516 /// \param[in] file_path Path to the user defined file. 517 /// \param[in] user_dict Vector of word-freq pairs extracted from the user defined file. 518 Status ParserFile(const std::string &file_path, std::vector<std::pair<std::string, int64_t>> *const user_dict); 519 520 /// \brief Used to translate all API strings to vector of char and reverse. 521 Status AddWordChar(const std::vector<char> &word, int64_t freq = 0); 522 523 /// \brief Used to translate all API strings to vector of char and reverse. 524 Status AddDictChar(const std::vector<std::pair<std::vector<char>, int64_t>> &user_dict); 525 526 /// \brief Used to translate all API strings to vector of char and reverse. 527 Status AddDictChar(const std::vector<char> &file_path); 528 529 struct Data; 530 std::shared_ptr<Data> data_; 531 }; 532 533 /// \brief Look up a word into an id according to the input vocabulary table. 534 class DATASET_API Lookup final : public TensorTransform { 535 public: 536 /// \brief Constructor. 537 /// \param[in] vocab a Vocab object. 538 /// \param[in] unknown_token Word is used for lookup. In case of the word is out of vocabulary (OOV), 539 /// the result of lookup will be replaced to unknown_token. If the unknown_token is not specified or it is OOV, 540 /// runtime error will be thrown (default={}, means no unknown_token is specified). 541 /// \param[in] data_type mindspore::DataType of the tensor after lookup; must be numeric, including bool. 542 /// (default=mindspore::DataType::kNumberTypeInt32). 543 /// \par Example 544 /// \code 545 /// /* Define operations */ 546 /// std::vector<std::string> list = {"a", "b", "c", "d"}; 547 /// std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 548 /// Status s = Vocab::BuildFromVector(list, {}, true, &vocab); 549 /// auto lookup_op = text::Lookup(vocab, "[unk]"); 550 /// 551 /// /* dataset is an instance of Dataset object */ 552 /// dataset = dataset->Map({lookup_op}, // operations 553 /// {"text"}); // input columns 554 /// \endcode 555 explicit Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::string> &unknown_token = {}, 556 mindspore::DataType data_type = mindspore::DataType::kNumberTypeInt32) { 557 std::optional<std::vector<char>> unknown_token_c = std::nullopt; 558 if (unknown_token != std::nullopt) { 559 unknown_token_c = std::vector<char>(unknown_token->begin(), unknown_token->end()); 560 } 561 new (this) Lookup(vocab, unknown_token_c, data_type); 562 } 563 564 /// \brief Constructor. 565 /// \param[in] vocab a Vocab object. 566 /// \param[in] unknown_token Word is used for lookup. In case of the word is out of vocabulary (OOV), 567 /// the result of lookup will be replaced to unknown_token. If the unknown_token is not specified or it is OOV, 568 /// runtime error will be thrown (default={}, means no unknown_token is specified). 569 /// \param[in] data_type mindspore::DataType of the tensor after lookup; must be numeric, including bool. 570 /// (default=mindspore::DataType::kNumberTypeInt32). 571 Lookup(const std::shared_ptr<Vocab> &vocab, const std::optional<std::vector<char>> &unknown_token, 572 mindspore::DataType data_type = mindspore::DataType::kNumberTypeInt32); 573 574 /// \brief Destructor 575 ~Lookup() override = default; 576 577 protected: 578 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 579 /// \return Shared pointer to the TensorOperation object. 580 std::shared_ptr<TensorOperation> Parse() override; 581 582 private: 583 struct Data; 584 std::shared_ptr<Data> data_; 585 }; 586 587 /// \brief Generate n-gram from a 1-D string Tensor. 588 class DATASET_API Ngram final : public TensorTransform { 589 public: 590 /// \brief Constructor. 591 /// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result 592 /// would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up 593 /// a n-gram, an empty string will be returned. 594 /// \param[in] left_pad {"pad_token", pad_width}. Padding performed on left side of the sequence. pad_width will 595 /// be capped at n-1. left_pad=("_",2) would pad the left side of the sequence with "__" (default={"", 0}}). 596 /// \param[in] right_pad {"pad_token", pad_width}. Padding performed on right side of the sequence.pad_width will 597 /// be capped at n-1. right_pad=("-",2) would pad the right side of the sequence with "--" (default={"", 0}}). 598 /// \param[in] separator Symbol used to join strings together (default=" "). 599 /// \par Example 600 /// \code 601 /// /* Define operations */ 602 /// auto ngram_op = text::Ngram({2, 3}, {"&", 2}, {"&", 2}, "-"); 603 /// 604 /// /* dataset is an instance of Dataset object */ 605 /// dataset = dataset->Map({ngram_op}, // operations 606 /// {"text"}); // input columns 607 /// \endcode 608 explicit Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::string, int32_t> &left_pad = {"", 0}, 609 const std::pair<std::string, int32_t> &right_pad = {"", 0}, const std::string &separator = " ") Ngram(ngrams,PairStringToChar (left_pad),PairStringToChar (right_pad),StringToChar (separator))610 : Ngram(ngrams, PairStringToChar(left_pad), PairStringToChar(right_pad), StringToChar(separator)) {} 611 612 /// \brief Constructor. 613 /// \param[in] ngrams ngrams is a vector of positive integers. For example, if ngrams={4, 3}, then the result 614 /// would be a 4-gram followed by a 3-gram in the same tensor. If the number of words is not enough to make up 615 /// a n-gram, an empty string will be returned. 616 /// \param[in] left_pad {"pad_token", pad_width}. Padding performed on left side of the sequence. pad_width will 617 /// be capped at n-1. left_pad=("_",2) would pad the left side of the sequence with "__" (default={"", 0}}). 618 /// \param[in] right_pad {"pad_token", pad_width}. Padding performed on right side of the sequence.pad_width will 619 /// be capped at n-1. right_pad=("-",2) would pad the right side of the sequence with "--" (default={"", 0}}). 620 /// \param[in] separator Symbol used to join strings together (default=" "). 621 Ngram(const std::vector<int32_t> &ngrams, const std::pair<std::vector<char>, int32_t> &left_pad, 622 const std::pair<std::vector<char>, int32_t> &right_pad, const std::vector<char> &separator); 623 624 /// \brief Destructor 625 ~Ngram() override = default; 626 627 protected: 628 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 629 /// \return Shared pointer to the TensorOperation object. 630 std::shared_ptr<TensorOperation> Parse() override; 631 632 private: 633 struct Data; 634 std::shared_ptr<Data> data_; 635 }; 636 637 #ifndef _WIN32 638 /// \brief Apply normalize operation to UTF-8 string tensors. 639 class DATASET_API NormalizeUTF8 final : public TensorTransform { 640 public: 641 /// \brief Constructor. 642 /// \param[in] normalize_form Valid values can be any of [NormalizeForm::kNone,NormalizeForm::kNfc, 643 /// NormalizeForm::kNfkc, NormalizeForm::kNfd, NormalizeForm::kNfkd](default=NormalizeForm::kNfkc). 644 /// See <http://unicode.org/reports/tr15/> for details. 645 /// - NormalizeForm.kNone, remain the input string tensor unchanged. 646 /// - NormalizeForm.kNfc, normalizes with Normalization Form C. 647 /// - NormalizeForm.kNfkc, normalizes with Normalization Form KC. 648 /// - NormalizeForm.kNfd, normalizes with Normalization Form D. 649 /// - NormalizeForm.kNfkd, normalizes with Normalization Form KD. 650 /// \par Example 651 /// \code 652 /// /* Define operations */ 653 /// auto normalizeutf8_op = text::NormalizeUTF8(); 654 /// 655 /// /* dataset is an instance of Dataset object */ 656 /// dataset = dataset->Map({normalizeutf8_op}, // operations 657 /// {"text"}); // input columns 658 /// \endcode 659 explicit NormalizeUTF8(NormalizeForm normalize_form = NormalizeForm::kNfkc); 660 661 /// \brief Destructor 662 ~NormalizeUTF8() override = default; 663 664 protected: 665 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 666 /// \return Shared pointer to the TensorOperation object. 667 std::shared_ptr<TensorOperation> Parse() override; 668 669 private: 670 struct Data; 671 std::shared_ptr<Data> data_; 672 }; 673 674 /// \brief Replace a UTF-8 string tensor with 'replace' according to regular expression 'pattern'. 675 class DATASET_API RegexReplace final : public TensorTransform { 676 public: 677 /// \brief Constructor. 678 /// \param[in] pattern The regex expression patterns. 679 /// \param[in] replace The string to replace the matched element. 680 /// \param[in] replace_all Confirm whether to replace all. If false, only replace the first matched element; 681 /// if true, replace all matched elements (default=true). 682 /// \par Example 683 /// \code 684 /// /* Define operations */ 685 /// auto regex_op = text::RegexReplace("\\s+", "_", true); 686 /// 687 /// /* dataset is an instance of Dataset object */ 688 /// dataset = dataset->Map({regex_op}, // operations 689 /// {"text"}); // input columns 690 /// \endcode 691 RegexReplace(const std::string &pattern, const std::string &replace, bool replace_all = true) RegexReplace(StringToChar (pattern),StringToChar (replace),replace_all)692 : RegexReplace(StringToChar(pattern), StringToChar(replace), replace_all) {} 693 694 /// \brief Constructor. 695 /// \param[in] pattern The regex expression patterns. Type should be char of vector. 696 /// \param[in] replace The string to replace the matched element. 697 /// \param[in] replace_all Confirm whether to replace all. If false, only replace the first matched element; 698 /// if true, replace all matched elements (default=true). 699 RegexReplace(const std::vector<char> &pattern, const std::vector<char> &replace, bool replace_all); 700 701 /// \brief Destructor 702 ~RegexReplace() override = default; 703 704 protected: 705 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 706 /// \return Shared pointer to the TensorOperation object. 707 std::shared_ptr<TensorOperation> Parse() override; 708 709 private: 710 struct Data; 711 std::shared_ptr<Data> data_; 712 }; 713 714 /// \brief Tokenize a scalar tensor of UTF-8 string by the regex expression pattern. 715 class DATASET_API RegexTokenizer final : public TensorTransform { 716 public: 717 /// \brief Constructor. 718 /// \param[in] delim_pattern The pattern of regex delimiters. 719 /// \param[in] keep_delim_pattern The string matched with 'delim_pattern' can be kept as a token if it can be 720 /// matched by 'keep_delim_pattern'. The default value is an empty string (""). 721 /// which means that delimiters will not be kept as an output token (default=""). 722 /// \param[in] with_offsets Whether to output offsets of tokens (default=false). 723 /// \par Example 724 /// \code 725 /// /* Define operations */ 726 /// auto regex_op = text::RegexTokenizer("\\s+", "\\s+", false); 727 /// 728 /// /* dataset is an instance of Dataset object */ 729 /// dataset = dataset->Map({regex_op}, // operations 730 /// {"text"}); // input columns 731 /// \endcode 732 explicit RegexTokenizer(const std::string &delim_pattern, const std::string &keep_delim_pattern = "", 733 bool with_offsets = false) RegexTokenizer(StringToChar (delim_pattern),StringToChar (keep_delim_pattern),with_offsets)734 : RegexTokenizer(StringToChar(delim_pattern), StringToChar(keep_delim_pattern), with_offsets) {} 735 736 explicit RegexTokenizer(const std::vector<char> &delim_pattern, const std::vector<char> &keep_delim_pattern, 737 bool with_offsets); 738 739 /// \brief Destructor 740 ~RegexTokenizer() override = default; 741 742 protected: 743 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 744 /// \return Shared pointer to the TensorOperation object. 745 std::shared_ptr<TensorOperation> Parse() override; 746 747 private: 748 struct Data; 749 std::shared_ptr<Data> data_; 750 }; 751 #endif 752 753 /// \brief Tokenize a scalar token or a 1-D token to tokens by sentencepiece. 754 class DATASET_API SentencePieceTokenizer final : public TensorTransform { 755 public: 756 /// \brief Constructor. 757 /// \param[in] vocab a SentencePieceVocab object. 758 /// \param[in] out_type The type of the output. 759 /// \par Example 760 /// \code 761 /// /* Define operations */ 762 /// std::shared_ptr<Dataset> ds_vocab = TextFile({"/path/to/vocab/file"}, 0, ShuffleMode::kFalse); 763 /// std::shared_ptr<SentencePieceVocab> vocab = 764 /// ds_vocab->BuildSentencePieceVocab({}, 0, 0.9995, SentencePieceModel::kUnigram, {}); 765 /// auto tokenizer_op = text::SentencePieceTokenizer(vocab, mindspore::dataset::SPieceTokenizerOutType::kString); 766 /// 767 /// /* dataset is an instance of Dataset object */ 768 /// dataset = dataset->Map({tokenizer_op}, // operations 769 /// {"text"}); // input columns 770 /// \endcode 771 SentencePieceTokenizer(const std::shared_ptr<SentencePieceVocab> &vocab, 772 mindspore::dataset::SPieceTokenizerOutType out_type); 773 774 /// \brief Constructor. 775 /// \param[in] vocab_path vocab model file path. 776 /// \param[in] out_type The type of the output. 777 /// \par Example 778 /// \code 779 /// /* Define operations */ 780 /// auto tokenizer_op = text::SentencePieceTokenizer("/path/to/model", 781 /// mindspore::dataset::SPieceTokenizerOutType::kInt); 782 /// 783 /// /* dataset is an instance of Dataset object */ 784 /// dataset = dataset->Map({tokenizer_op}, // operations 785 /// {"text"}); // input columns 786 /// \endcode SentencePieceTokenizer(const std::string & vocab_path,mindspore::dataset::SPieceTokenizerOutType out_type)787 SentencePieceTokenizer(const std::string &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type) 788 : SentencePieceTokenizer(StringToChar(vocab_path), out_type) {} 789 790 /// \brief Constructor. 791 /// \param[in] vocab_path vocab model file path. type should be char of vector. 792 /// \param[in] out_type The type of the output. 793 SentencePieceTokenizer(const std::vector<char> &vocab_path, mindspore::dataset::SPieceTokenizerOutType out_type); 794 795 /// \brief Destructor 796 ~SentencePieceTokenizer() override = default; 797 798 protected: 799 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 800 /// \return Shared pointer to the TensorOperation object. 801 std::shared_ptr<TensorOperation> Parse() override; 802 803 private: 804 struct Data; 805 std::shared_ptr<Data> data_; 806 }; 807 808 /// \brief Construct a tensor from data (only 1-D for now), where each element in the dimension 809 /// axis is a slice of data starting at the corresponding position, with a specified width. 810 class DATASET_API SlidingWindow final : public TensorTransform { 811 public: 812 /// \brief Constructor. 813 /// \param[in] width The width of the window. It must be an integer and greater than zero. 814 /// \param[in] axis The axis where the sliding window is computed (default=0), axis only 815 /// supports 0 or -1 for now. 816 /// \par Example 817 /// \code 818 /// /* Define operations */ 819 /// auto slidingwindow_op = text::SlidingWindow(5, 0); 820 /// 821 /// /* dataset is an instance of Dataset object */ 822 /// dataset = dataset->Map({slidingwindow_op}, // operations 823 /// {"text"}); // input columns 824 /// \endcode 825 explicit SlidingWindow(int32_t width, int32_t axis = 0); 826 827 /// \brief Destructor 828 ~SlidingWindow() override = default; 829 830 protected: 831 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 832 /// \return Shared pointer to the TensorOperation object. 833 std::shared_ptr<TensorOperation> Parse() override; 834 835 private: 836 struct Data; 837 std::shared_ptr<Data> data_; 838 }; 839 840 /// \brief Convert every element in a string tensor to a number. 841 /// Strings are cast according to the rules specified in the following links: 842 /// https://en.cppreference.com/w/cpp/string/basic_string/stof, 843 /// https://en.cppreference.com/w/cpp/string/basic_string/stoul, 844 /// except that any strings which represent negative numbers cannot be cast to an unsigned integer type. 845 class DATASET_API ToNumber final : public TensorTransform { 846 public: 847 /// \brief Constructor. 848 /// \param[in] data_type mindspore::DataType of the tensor to be cast to. Must be a numeric type, excluding bool. 849 /// \par Example 850 /// \code 851 /// /* Define operations */ 852 /// auto to_number_op = text::ToNumber(mindspore::DataType::kNumberTypeInt8); 853 /// 854 /// /* dataset is an instance of Dataset object */ 855 /// dataset = dataset->Map({to_number_op}, // operations 856 /// {"text"}); // input columns 857 /// \endcode 858 explicit ToNumber(mindspore::DataType data_type); 859 860 /// \brief Destructor 861 ~ToNumber() override = default; 862 863 protected: 864 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 865 /// \return Shared pointer to the TensorOperation object. 866 std::shared_ptr<TensorOperation> Parse() override; 867 868 private: 869 struct Data; 870 std::shared_ptr<Data> data_; 871 }; 872 873 /// \brief Look up a token into an vector according to the input Vectors table. 874 class DATASET_API ToVectors final : public TensorTransform { 875 public: 876 /// \brief Constructor. 877 /// \param[in] vectors A Vectors object. 878 /// \param[in] unk_init In case of the token is out-of-vectors (OOV), the result will be initialized with `unk_init`. 879 /// (default={}, means to initialize with zero vectors). 880 /// \param[in] lower_case_backup Whether to look up the token in the lower case (default=false). 881 explicit ToVectors(const std::shared_ptr<Vectors> &vectors, const std::vector<float> &unk_init = {}, 882 bool lower_case_backup = false); 883 884 /// \brief Destructor 885 ~ToVectors() override = default; 886 887 protected: 888 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 889 /// \return Shared pointer to the TensorOperation object. 890 std::shared_ptr<TensorOperation> Parse() override; 891 892 private: 893 struct Data; 894 std::shared_ptr<Data> data_; 895 }; 896 897 /// \brief Truncate the input sequence so that it does not exceed the maximum length. 898 class DATASET_API Truncate final : public TensorTransform { 899 public: 900 /// \brief Constructor. 901 /// \param[in] max_seq_len Maximum allowable length. 902 /// \par Example 903 /// \code 904 /// /* Define operations */ 905 /// auto truncate_op = text::Truncate(5); 906 /// 907 /// /* dataset is an instance of Dataset object */ 908 /// dataset = dataset->Map({truncate_op}, // operations 909 /// {"text"}); // input columns 910 /// \endcode 911 explicit Truncate(int32_t max_seq_len); 912 913 /// \brief Destructor. 914 ~Truncate() override = default; 915 916 protected: 917 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 918 /// \return Shared pointer to the TensorOperation object. 919 std::shared_ptr<TensorOperation> Parse() override; 920 921 private: 922 struct Data; 923 std::shared_ptr<Data> data_; 924 }; 925 926 /// \brief Truncate a pair of rank-1 tensors such that the total length is less than max_length. 927 class DATASET_API TruncateSequencePair final : public TensorTransform { 928 public: 929 /// \brief Constructor. 930 /// \param[in] max_length Maximum length required. 931 /// \par Example 932 /// \code 933 /// /* Define operations */ 934 /// auto truncate_op = text::TruncateSequencePair(5); 935 /// 936 /// /* dataset is an instance of Dataset object */ 937 /// dataset = dataset->Map({truncate_op}, // operations 938 /// {"text"}); // input columns 939 /// \endcode 940 explicit TruncateSequencePair(int32_t max_length); 941 942 /// \brief Destructor 943 ~TruncateSequencePair() override = default; 944 945 protected: 946 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 947 /// \return Shared pointer to the TensorOperation object. 948 std::shared_ptr<TensorOperation> Parse() override; 949 950 private: 951 struct Data; 952 std::shared_ptr<Data> data_; 953 }; 954 955 /// \brief Tokenize a scalar tensor of UTF-8 string to Unicode characters. 956 class DATASET_API UnicodeCharTokenizer final : public TensorTransform { 957 public: 958 /// \brief Constructor. 959 /// \param[in] with_offsets whether to output offsets of tokens (default=false). 960 /// \par Example 961 /// \code 962 /// /* Define operations */ 963 /// auto tokenizer_op = text::UnicodeCharTokenizer(); 964 /// 965 /// /* dataset is an instance of Dataset object */ 966 /// dataset = dataset->Map({tokenizer_op}, // operations 967 /// {"text"}); // input columns 968 /// \endcode 969 explicit UnicodeCharTokenizer(bool with_offsets = false); 970 971 /// \brief Destructor 972 ~UnicodeCharTokenizer() override = default; 973 974 protected: 975 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 976 /// \return Shared pointer to the TensorOperation object. 977 std::shared_ptr<TensorOperation> Parse() override; 978 979 private: 980 struct Data; 981 std::shared_ptr<Data> data_; 982 }; 983 984 /// \brief Tokenize scalar token or 1-D tokens to 1-D sub-word tokens. 985 class DATASET_API WordpieceTokenizer final : public TensorTransform { 986 public: 987 /// \brief Constructor. 988 /// \param[in] vocab A Vocab object. 989 /// \param[in] suffix_indicator This parameter is used to show that the sub-word 990 /// is the last part of a word (default='##'). 991 /// \param[in] max_bytes_per_token Tokens exceeding this length will not be further split (default=100). 992 /// \param[in] unknown_token When a token cannot be found, return the token directly if 'unknown_token' is an empty 993 /// string, else return the specified string (default='[UNK]'). 994 /// \param[in] with_offsets whether to output offsets of tokens (default=false). 995 /// \par Example 996 /// \code 997 /// /* Define operations */ 998 /// std::vector<std::string> word_list = {"book", "apple", "rabbit"}; 999 /// std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 1000 /// Status s = Vocab::BuildFromVector(word_list, {}, true, &vocab); 1001 /// auto tokenizer_op = text::WordpieceTokenizer(vocab); 1002 /// 1003 /// /* dataset is an instance of Dataset object */ 1004 /// dataset = dataset->Map({tokenizer_op}, // operations 1005 /// {"text"}); // input columns 1006 /// \endcode 1007 explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::string &suffix_indicator = "##", 1008 int32_t max_bytes_per_token = 100, const std::string &unknown_token = "[UNK]", 1009 bool with_offsets = false) WordpieceTokenizer(vocab,StringToChar (suffix_indicator),max_bytes_per_token,StringToChar (unknown_token),with_offsets)1010 : WordpieceTokenizer(vocab, StringToChar(suffix_indicator), max_bytes_per_token, StringToChar(unknown_token), 1011 with_offsets) {} 1012 1013 explicit WordpieceTokenizer(const std::shared_ptr<Vocab> &vocab, const std::vector<char> &suffix_indicator, 1014 int32_t max_bytes_per_token, const std::vector<char> &unknown_token, bool with_offsets); 1015 1016 /// \brief Destructor 1017 ~WordpieceTokenizer() override = default; 1018 1019 protected: 1020 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1021 /// \return Shared pointer to the TensorOperation object. 1022 std::shared_ptr<TensorOperation> Parse() override; 1023 1024 private: 1025 struct Data; 1026 std::shared_ptr<Data> data_; 1027 }; 1028 1029 #ifndef _WIN32 1030 /// \brief Tokenize a scalar tensor of UTF-8 string on Unicode script boundaries. 1031 class DATASET_API UnicodeScriptTokenizer final : public TensorTransform { 1032 public: 1033 /// \brief Constructor. 1034 /// \param[in] keep_whitespace whether to emit whitespace tokens (default=false). 1035 /// \param[in] with_offsets whether to output offsets of tokens (default=false). 1036 /// \par Example 1037 /// \code 1038 /// /* Define operations */ 1039 /// auto tokenizer_op = text::UnicodeScriptTokenizer(false, true); 1040 /// 1041 /// /* dataset is an instance of Dataset object */ 1042 /// dataset = dataset->Map({tokenizer_op}, // operations 1043 /// {"text"}); // input columns 1044 /// \endcode 1045 explicit UnicodeScriptTokenizer(bool keep_whitespace = false, bool with_offsets = false); 1046 1047 /// \brief Destructor 1048 ~UnicodeScriptTokenizer() override = default; 1049 1050 protected: 1051 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1052 /// \return Shared pointer to the TensorOperation object. 1053 std::shared_ptr<TensorOperation> Parse() override; 1054 1055 private: 1056 struct Data; 1057 std::shared_ptr<Data> data_; 1058 }; 1059 1060 /// \brief Tokenize a scalar tensor of UTF-8 string on ICU4C defined whitespaces. 1061 class DATASET_API WhitespaceTokenizer final : public TensorTransform { 1062 public: 1063 /// \brief Constructor. 1064 /// \param[in] with_offsets whether to output offsets of tokens (default=false). 1065 /// \par Example 1066 /// \code 1067 /// /* Define operations */ 1068 /// auto tokenizer_op = text::WhitespaceTokenizer(false); 1069 /// 1070 /// /* dataset is an instance of Dataset object */ 1071 /// dataset = dataset->Map({tokenizer_op}, // operations 1072 /// {"text"}); // input columns 1073 /// \endcode 1074 explicit WhitespaceTokenizer(bool with_offsets = false); 1075 1076 /// \brief Destructor 1077 ~WhitespaceTokenizer() override = default; 1078 1079 protected: 1080 /// \brief The function to convert a TensorTransform object into a TensorOperation object. 1081 /// \return Shared pointer to the TensorOperation object. 1082 std::shared_ptr<TensorOperation> Parse() override; 1083 1084 private: 1085 struct Data; 1086 std::shared_ptr<Data> data_; 1087 }; 1088 #endif 1089 } // namespace text 1090 } // namespace dataset 1091 } // namespace mindspore 1092 #endif // MINDSPORE_CCSRC_MINDDATA_DATASET_INCLUDE_DATASET_TEXT_H_ 1093