1 /** 2 * Copyright 2020-2021 Huawei Technologies Co., Ltd 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 #include <memory> 17 #include <vector> 18 #include <string> 19 20 #include "common/common.h" 21 #include "include/api/status.h" 22 #include "minddata/dataset/include/dataset/config.h" 23 #include "minddata/dataset/include/dataset/datasets.h" 24 #include "minddata/dataset/include/dataset/text.h" 25 #include "minddata/dataset/include/dataset/transforms.h" 26 #include "minddata/dataset/text/vocab.h" 27 28 using namespace mindspore::dataset; 29 using mindspore::Status; 30 using mindspore::dataset::ShuffleMode; 31 using mindspore::dataset::Tensor; 32 using mindspore::dataset::Vocab; 33 34 class MindDataTestPipeline : public UT::DatasetOpTesting { 35 protected: 36 }; 37 38 TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess1) { 39 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess1."; 40 // Test BasicTokenizer with default parameters 41 42 // Create a TextFile dataset 43 std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt"; 44 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 45 EXPECT_NE(ds, nullptr); 46 47 // Create Take operation on ds 48 ds = ds->Take(6); 49 EXPECT_NE(ds, nullptr); 50 51 // Create BasicTokenizer operation on ds 52 std::shared_ptr<TensorTransform> basic_tokenizer = std::make_shared<text::BasicTokenizer>(); 53 EXPECT_NE(basic_tokenizer, 
nullptr); 54 55 // Create Map operation on ds 56 ds = ds->Map({basic_tokenizer}, {"text"}); 57 EXPECT_NE(ds, nullptr); 58 59 // Create an iterator over the result of the above dataset 60 // This will trigger the creation of the Execution Tree and launch it. 61 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 62 EXPECT_NE(iter, nullptr); 63 64 // Iterate the dataset and get each row 65 std::unordered_map<std::string, mindspore::MSTensor> row; 66 ASSERT_OK(iter->GetNextRow(&row)); 67 68 std::vector<std::vector<std::string>> expected = { 69 {"Welcome", "to", "Beijing", "北", "京", "欢", "迎", "您"}, 70 {"長", "風", "破", "浪", "會", "有", "時", ",", "直", "掛", "雲", "帆", "濟", "滄", "海"}, 71 {"", "嘿", "嘿", "", "哈", "哈", "", "大", "笑", "", "嘻", "嘻"}, 72 {"明", "朝", "(", "1368", "—", "1644", "年", ")", "和", "清", "朝", "(", "1644", "—", "1911", "年", ")", 73 ",", "是", "中", "国", "封", "建", "王", "朝", "史", "上", "最", "后", "两", "个", "朝", "代"}, 74 {"明", "代", "(", "1368", "-", "1644", ")", "と", "清", "代", "(", "1644", 75 "-", "1911", ")", "は", "、", "中", "国", "の", "封", "建", "王", "朝", 76 "の", "歴", "史", "における", "最", "後", "の2つの", "王", "朝", "でした"}, 77 {"명나라", "(", "1368", "-", "1644", ")", "와", "청나라", "(", "1644", "-", 78 "1911", ")", "는", "중국", "봉건", "왕조의", "역사에서", "마지막", "두", "왕조였다"}}; 79 80 uint64_t i = 0; 81 while (row.size() != 0) { 82 auto ind = row["text"]; 83 std::shared_ptr<Tensor> de_expected_tensor; 84 ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); 85 mindspore::MSTensor expected_tensor = 86 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 87 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 88 89 ASSERT_OK(iter->GetNextRow(&row)); 90 i++; 91 } 92 93 EXPECT_EQ(i, 6); 94 95 // Manually terminate the pipeline 96 iter->Stop(); 97 } 98 99 TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess2) { 100 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess2."; 101 // Test BasicTokenizer with lower_case true 102 103 // Create 
a TextFile dataset 104 std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt"; 105 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 106 EXPECT_NE(ds, nullptr); 107 108 // Create Skip operation on ds 109 ds = ds->Skip(6); 110 EXPECT_NE(ds, nullptr); 111 112 // Create BasicTokenizer operation on ds 113 std::shared_ptr<TensorTransform> basic_tokenizer = std::make_shared<text::BasicTokenizer>(true); 114 EXPECT_NE(basic_tokenizer, nullptr); 115 116 // Create Map operation on ds 117 ds = ds->Map({basic_tokenizer}, {"text"}); 118 EXPECT_NE(ds, nullptr); 119 120 // Create an iterator over the result of the above dataset 121 // This will trigger the creation of the Execution Tree and launch it. 122 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 123 EXPECT_NE(iter, nullptr); 124 125 // Iterate the dataset and get each row 126 std::unordered_map<std::string, mindspore::MSTensor> row; 127 ASSERT_OK(iter->GetNextRow(&row)); 128 129 std::vector<std::string> expected = {"this", "is", "a", "funky", "string"}; 130 std::shared_ptr<Tensor> de_expected_tensor; 131 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); 132 mindspore::MSTensor expected_tensor = 133 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 134 135 uint64_t i = 0; 136 while (row.size() != 0) { 137 auto ind = row["text"]; 138 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 139 ASSERT_OK(iter->GetNextRow(&row)); 140 i++; 141 } 142 143 EXPECT_EQ(i, 1); 144 145 // Manually terminate the pipeline 146 iter->Stop(); 147 } 148 149 TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess3) { 150 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess3."; 151 // Test BasicTokenizer with with_offsets true and lower_case true 152 153 // Create a TextFile dataset 154 std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt"; 155 std::shared_ptr<Dataset> ds = 
TextFile({data_file}, 0, ShuffleMode::kFalse); 156 EXPECT_NE(ds, nullptr); 157 158 // Create Skip operation on ds 159 ds = ds->Skip(6); 160 EXPECT_NE(ds, nullptr); 161 162 // Create BasicTokenizer operation on ds 163 std::shared_ptr<TensorTransform> basic_tokenizer = 164 std::make_shared<text::BasicTokenizer>(true, false, NormalizeForm::kNone, true, true); 165 EXPECT_NE(basic_tokenizer, nullptr); 166 167 // Create Map operation on ds 168 ds = ds->Map({basic_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"}); 169 EXPECT_NE(ds, nullptr); 170 171 // Create an iterator over the result of the above dataset 172 // This will trigger the creation of the Execution Tree and launch it. 173 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 174 EXPECT_NE(iter, nullptr); 175 176 // Iterate the dataset and get each row 177 std::unordered_map<std::string, mindspore::MSTensor> row; 178 ASSERT_OK(iter->GetNextRow(&row)); 179 180 std::vector<std::string> expected_tokens = {"this", "is", "a", "funky", "string"}; 181 std::vector<uint32_t> expected_offsets_start = {0, 5, 8, 10, 16}; 182 std::vector<uint32_t> expected_offsets_limit = {4, 7, 9, 15, 22}; 183 184 std::shared_ptr<Tensor> de_expected_tokens; 185 ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens)); 186 mindspore::MSTensor ms_expected_tokens = 187 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens)); 188 189 std::shared_ptr<Tensor> de_expected_offsets_start; 190 ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start)); 191 mindspore::MSTensor ms_expected_offsets_start = 192 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start)); 193 194 std::shared_ptr<Tensor> de_expected_offsets_limit; 195 ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit)); 196 mindspore::MSTensor ms_expected_offsets_limit = 197 
mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit)); 198 199 uint64_t i = 0; 200 while (row.size() != 0) { 201 auto ind = row["token"]; 202 EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens); 203 204 auto start = row["offsets_start"]; 205 EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start); 206 207 auto limit = row["offsets_limit"]; 208 EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit); 209 210 ASSERT_OK(iter->GetNextRow(&row)); 211 i++; 212 } 213 214 EXPECT_EQ(i, 1); 215 216 // Manually terminate the pipeline 217 iter->Stop(); 218 } 219 220 std::vector<std::string> list = { 221 "床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头", 222 "望", "低", "思", "故", "乡", "繁", "體", "字", "嘿", "哈", "大", "笑", 223 "嘻", "i", "am", "mak", "make", "small", "mistake", "##s", "during", "work", "##ing", "hour", 224 "", "", "", "", "+", "/", "-", "=", "12", "28", "40", "16", 225 " ", "I", "[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]", "[unused1]", "[unused10]"}; 226 227 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess1) { 228 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess1."; 229 // Test BertTokenizer with default parameters 230 231 // Create a TextFile dataset 232 std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt"; 233 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 234 EXPECT_NE(ds, nullptr); 235 236 // Create Take operation on ds 237 ds = ds->Take(4); 238 EXPECT_NE(ds, nullptr); 239 240 // Create a vocab from vector 241 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 242 Status s = Vocab::BuildFromVector(list, {}, true, &vocab); 243 EXPECT_EQ(s, Status::OK()); 244 245 // Create BertTokenizer operation on ds 246 std::shared_ptr<TensorTransform> bert_tokenizer = std::make_shared<text::BertTokenizer>(vocab); 247 EXPECT_NE(bert_tokenizer, nullptr); 248 249 // Create Map operation on ds 250 ds = ds->Map({bert_tokenizer}, {"text"}); 251 
EXPECT_NE(ds, nullptr); 252 253 // Create an iterator over the result of the above dataset 254 // This will trigger the creation of the Execution Tree and launch it. 255 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 256 EXPECT_NE(iter, nullptr); 257 258 // Iterate the dataset and get each row 259 std::unordered_map<std::string, mindspore::MSTensor> row; 260 ASSERT_OK(iter->GetNextRow(&row)); 261 262 std::vector<std::vector<std::string>> expected = {{"床", "前", "明", "月", "光"}, 263 {"疑", "是", "地", "上", "霜"}, 264 {"举", "头", "望", "明", "月"}, 265 {"低", "头", "思", "故", "乡"}}; 266 267 uint64_t i = 0; 268 while (row.size() != 0) { 269 auto ind = row["text"]; 270 std::shared_ptr<Tensor> de_expected_tensor; 271 ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); 272 mindspore::MSTensor expected_tensor = 273 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 274 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 275 276 ASSERT_OK(iter->GetNextRow(&row)); 277 i++; 278 } 279 280 EXPECT_EQ(i, 4); 281 282 // Manually terminate the pipeline 283 iter->Stop(); 284 } 285 286 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess2) { 287 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess2."; 288 // Test BertTokenizer with lower_case true 289 290 // Create a TextFile dataset 291 std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt"; 292 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 293 EXPECT_NE(ds, nullptr); 294 295 // Create Skip operation on ds 296 ds = ds->Skip(4); 297 EXPECT_NE(ds, nullptr); 298 299 // Create Take operation on ds 300 ds = ds->Take(1); 301 EXPECT_NE(ds, nullptr); 302 303 // Create a vocab from vector 304 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 305 Status s = Vocab::BuildFromVector(list, {}, true, &vocab); 306 EXPECT_EQ(s, Status::OK()); 307 308 // Create BertTokenizer operation on ds 309 
std::shared_ptr<TensorTransform> bert_tokenizer = 310 std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", true); 311 EXPECT_NE(bert_tokenizer, nullptr); 312 313 // Create Map operation on ds 314 ds = ds->Map({bert_tokenizer}, {"text"}); 315 EXPECT_NE(ds, nullptr); 316 317 // Create an iterator over the result of the above dataset 318 // This will trigger the creation of the Execution Tree and launch it. 319 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 320 EXPECT_NE(iter, nullptr); 321 322 // Iterate the dataset and get each row 323 std::unordered_map<std::string, mindspore::MSTensor> row; 324 ASSERT_OK(iter->GetNextRow(&row)); 325 326 std::vector<std::string> expected = {"i", "am", "mak", "##ing", "small", "mistake", 327 "##s", "during", "work", "##ing", "hour", "##s"}; 328 std::shared_ptr<Tensor> de_expected_tensor; 329 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); 330 mindspore::MSTensor expected_tensor = 331 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 332 333 uint64_t i = 0; 334 while (row.size() != 0) { 335 auto ind = row["text"]; 336 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 337 ASSERT_OK(iter->GetNextRow(&row)); 338 i++; 339 } 340 341 EXPECT_EQ(i, 1); 342 343 // Manually terminate the pipeline 344 iter->Stop(); 345 } 346 347 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess3) { 348 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess3."; 349 // Test BertTokenizer with normalization_form NFKC 350 351 // Create a TextFile dataset 352 std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt"; 353 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 354 EXPECT_NE(ds, nullptr); 355 356 // Create Skip operation on ds 357 ds = ds->Skip(5); 358 EXPECT_NE(ds, nullptr); 359 360 // Create Take operation on ds 361 ds = ds->Take(2); 362 EXPECT_NE(ds, nullptr); 363 364 // Create a vocab from vector 365 
std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 366 Status s = Vocab::BuildFromVector(list, {}, true, &vocab); 367 EXPECT_EQ(s, Status::OK()); 368 369 // Create BertTokenizer operation on ds 370 std::shared_ptr<TensorTransform> bert_tokenizer = 371 std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", false, false, NormalizeForm::kNfc); 372 EXPECT_NE(bert_tokenizer, nullptr); 373 374 // Create Map operation on ds 375 ds = ds->Map({bert_tokenizer}, {"text"}); 376 EXPECT_NE(ds, nullptr); 377 378 // Create an iterator over the result of the above dataset 379 // This will trigger the creation of the Execution Tree and launch it. 380 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 381 EXPECT_NE(iter, nullptr); 382 383 // Iterate the dataset and get each row 384 std::unordered_map<std::string, mindspore::MSTensor> row; 385 ASSERT_OK(iter->GetNextRow(&row)); 386 387 std::vector<std::vector<std::string>> expected = { 388 {"", "嘿", "嘿", "", "哈", "哈", "", "大", "笑", "", "嘻", "嘻"}, {"繁", "體", "字"}}; 389 390 uint64_t i = 0; 391 while (row.size() != 0) { 392 auto ind = row["text"]; 393 std::shared_ptr<Tensor> de_expected_tensor; 394 ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor)); 395 mindspore::MSTensor expected_tensor = 396 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 397 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 398 399 ASSERT_OK(iter->GetNextRow(&row)); 400 i++; 401 } 402 403 EXPECT_EQ(i, 2); 404 405 // Manually terminate the pipeline 406 iter->Stop(); 407 } 408 409 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess4) { 410 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess4."; 411 // Test BertTokenizer with keep_whitespace true 412 413 // Create a TextFile dataset 414 std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt"; 415 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 416 EXPECT_NE(ds, nullptr); 
417 418 // Create Skip operation on ds 419 ds = ds->Skip(7); 420 EXPECT_NE(ds, nullptr); 421 422 // Create Take operation on ds 423 ds = ds->Take(1); 424 EXPECT_NE(ds, nullptr); 425 426 // Create a vocab from vector 427 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 428 Status s = Vocab::BuildFromVector(list, {}, true, &vocab); 429 EXPECT_EQ(s, Status::OK()); 430 431 // Create BertTokenizer operation on ds 432 std::shared_ptr<TensorTransform> bert_tokenizer = 433 std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", false, true); 434 EXPECT_NE(bert_tokenizer, nullptr); 435 436 // Create Map operation on ds 437 ds = ds->Map({bert_tokenizer}, {"text"}); 438 EXPECT_NE(ds, nullptr); 439 440 // Create an iterator over the result of the above dataset 441 // This will trigger the creation of the Execution Tree and launch it. 442 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 443 EXPECT_NE(iter, nullptr); 444 445 // Iterate the dataset and get each row 446 std::unordered_map<std::string, mindspore::MSTensor> row; 447 ASSERT_OK(iter->GetNextRow(&row)); 448 449 std::vector<std::string> expected = {"[UNK]", " ", "[CLS]"}; 450 std::shared_ptr<Tensor> de_expected_tensor; 451 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); 452 mindspore::MSTensor expected_tensor = 453 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 454 455 uint64_t i = 0; 456 while (row.size() != 0) { 457 auto ind = row["text"]; 458 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 459 ASSERT_OK(iter->GetNextRow(&row)); 460 i++; 461 } 462 463 EXPECT_EQ(i, 1); 464 465 // Manually terminate the pipeline 466 iter->Stop(); 467 } 468 469 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess5) { 470 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess5."; 471 // Test BertTokenizer with unknown_token empty and keep_whitespace true 472 473 // Create a TextFile dataset 474 std::string data_file = datasets_root_path_ + 
"/testTokenizerData/bert_tokenizer.txt"; 475 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 476 EXPECT_NE(ds, nullptr); 477 478 // Create Skip operation on ds 479 ds = ds->Skip(7); 480 EXPECT_NE(ds, nullptr); 481 482 // Create Take operation on ds 483 ds = ds->Take(1); 484 EXPECT_NE(ds, nullptr); 485 486 // Create a vocab from vector 487 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 488 Status s = Vocab::BuildFromVector(list, {}, true, &vocab); 489 EXPECT_EQ(s, Status::OK()); 490 491 // Create BertTokenizer operation on ds 492 std::shared_ptr<TensorTransform> bert_tokenizer = 493 std::make_shared<text::BertTokenizer>(vocab, "##", 100, "", false, true); 494 EXPECT_NE(bert_tokenizer, nullptr); 495 496 // Create Map operation on ds 497 ds = ds->Map({bert_tokenizer}, {"text"}); 498 EXPECT_NE(ds, nullptr); 499 500 // Create an iterator over the result of the above dataset 501 // This will trigger the creation of the Execution Tree and launch it. 502 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 503 EXPECT_NE(iter, nullptr); 504 505 // Iterate the dataset and get each row 506 std::unordered_map<std::string, mindspore::MSTensor> row; 507 ASSERT_OK(iter->GetNextRow(&row)); 508 509 std::vector<std::string> expected = {"unused", " ", "[CLS]"}; 510 std::shared_ptr<Tensor> de_expected_tensor; 511 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); 512 mindspore::MSTensor expected_tensor = 513 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 514 515 uint64_t i = 0; 516 while (row.size() != 0) { 517 auto ind = row["text"]; 518 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 519 ASSERT_OK(iter->GetNextRow(&row)); 520 i++; 521 } 522 523 EXPECT_EQ(i, 1); 524 525 // Manually terminate the pipeline 526 iter->Stop(); 527 } 528 529 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess6) { 530 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess6."; 531 // Test 
BertTokenizer with preserve_unused_token false, unknown_token empty and keep_whitespace true 532 533 // Create a TextFile dataset 534 std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt"; 535 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 536 EXPECT_NE(ds, nullptr); 537 538 // Create Skip operation on ds 539 ds = ds->Skip(7); 540 EXPECT_NE(ds, nullptr); 541 542 // Create Take operation on ds 543 ds = ds->Take(1); 544 EXPECT_NE(ds, nullptr); 545 546 // Create a vocab from vector 547 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 548 Status s = Vocab::BuildFromVector(list, {}, true, &vocab); 549 EXPECT_EQ(s, Status::OK()); 550 551 // Create BertTokenizer operation on ds 552 std::shared_ptr<TensorTransform> bert_tokenizer = 553 std::make_shared<text::BertTokenizer>(vocab, "##", 100, "", false, true, NormalizeForm::kNone, false); 554 EXPECT_NE(bert_tokenizer, nullptr); 555 556 // Create Map operation on ds 557 ds = ds->Map({bert_tokenizer}, {"text"}); 558 EXPECT_NE(ds, nullptr); 559 560 // Create an iterator over the result of the above dataset 561 // This will trigger the creation of the Execution Tree and launch it. 
562 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 563 EXPECT_NE(iter, nullptr); 564 565 // Iterate the dataset and get each row 566 std::unordered_map<std::string, mindspore::MSTensor> row; 567 ASSERT_OK(iter->GetNextRow(&row)); 568 569 std::vector<std::string> expected = {"unused", " ", "[", "CLS", "]"}; 570 std::shared_ptr<Tensor> de_expected_tensor; 571 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); 572 mindspore::MSTensor expected_tensor = 573 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 574 575 uint64_t i = 0; 576 while (row.size() != 0) { 577 auto ind = row["text"]; 578 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 579 ASSERT_OK(iter->GetNextRow(&row)); 580 i++; 581 } 582 583 EXPECT_EQ(i, 1); 584 585 // Manually terminate the pipeline 586 iter->Stop(); 587 } 588 589 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess7) { 590 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess7."; 591 // Test BertTokenizer with with_offsets true and lower_case true 592 593 // Create a TextFile dataset 594 std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt"; 595 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 596 EXPECT_NE(ds, nullptr); 597 598 // Create Skip operation on ds 599 ds = ds->Skip(4); 600 EXPECT_NE(ds, nullptr); 601 602 // Create Take operation on ds 603 ds = ds->Take(1); 604 EXPECT_NE(ds, nullptr); 605 606 // Create a vocab from vector 607 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 608 Status s = Vocab::BuildFromVector(list, {}, true, &vocab); 609 EXPECT_EQ(s, Status::OK()); 610 611 // Create BertTokenizer operation on ds 612 std::shared_ptr<TensorTransform> bert_tokenizer = 613 std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", true, false, NormalizeForm::kNone, true, true); 614 EXPECT_NE(bert_tokenizer, nullptr); 615 616 // Create Map operation on ds 617 ds = ds->Map({bert_tokenizer}, 
{"text"}, {"token", "offsets_start", "offsets_limit"}); 618 EXPECT_NE(ds, nullptr); 619 620 // Create an iterator over the result of the above dataset 621 // This will trigger the creation of the Execution Tree and launch it. 622 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 623 EXPECT_NE(iter, nullptr); 624 625 // Iterate the dataset and get each row 626 std::unordered_map<std::string, mindspore::MSTensor> row; 627 ASSERT_OK(iter->GetNextRow(&row)); 628 629 std::vector<std::string> expected_tokens = {"i", "am", "mak", "##ing", "small", "mistake", 630 "##s", "during", "work", "##ing", "hour", "##s"}; 631 std::vector<uint32_t> expected_offsets_start = {0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46}; 632 std::vector<uint32_t> expected_offsets_limit = {1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47}; 633 634 std::shared_ptr<Tensor> de_expected_tokens; 635 ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens)); 636 mindspore::MSTensor ms_expected_tokens = 637 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens)); 638 639 std::shared_ptr<Tensor> de_expected_offsets_start; 640 ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start)); 641 mindspore::MSTensor ms_expected_offsets_start = 642 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start)); 643 644 std::shared_ptr<Tensor> de_expected_offsets_limit; 645 ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit)); 646 mindspore::MSTensor ms_expected_offsets_limit = 647 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit)); 648 649 uint64_t i = 0; 650 while (row.size() != 0) { 651 auto ind = row["token"]; 652 EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens); 653 654 auto start = row["offsets_start"]; 655 EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start); 656 657 auto limit = row["offsets_limit"]; 658 EXPECT_MSTENSOR_EQ(limit, 
ms_expected_offsets_limit); 659 660 ASSERT_OK(iter->GetNextRow(&row)); 661 i++; 662 } 663 664 EXPECT_EQ(i, 1); 665 666 // Manually terminate the pipeline 667 iter->Stop(); 668 } 669 670 TEST_F(MindDataTestPipeline, TestBertTokenizerFail1) { 671 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail1."; 672 // Test BertTokenizer with nullptr vocab 673 674 // Create a TextFile dataset 675 std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt"; 676 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 677 EXPECT_NE(ds, nullptr); 678 679 // Create BertTokenizer operation on ds 680 std::shared_ptr<TensorTransform> bert_tokenizer = std::make_shared<text::BertTokenizer>(nullptr); 681 EXPECT_NE(bert_tokenizer, nullptr); 682 683 // Create a Map operation on ds 684 ds = ds->Map({bert_tokenizer}); 685 EXPECT_NE(ds, nullptr); 686 687 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 688 // Expect failure: invalid BertTokenizer input with nullptr vocab 689 EXPECT_EQ(iter, nullptr); 690 } 691 692 TEST_F(MindDataTestPipeline, TestBertTokenizerFail2) { 693 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail2."; 694 // Test BertTokenizer with negative max_bytes_per_token 695 696 // Create a TextFile dataset 697 std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt"; 698 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 699 EXPECT_NE(ds, nullptr); 700 701 // Create a vocab from vector 702 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>(); 703 Status s = Vocab::BuildFromVector(list, {}, true, &vocab); 704 EXPECT_EQ(s, Status::OK()); 705 706 // Create BertTokenizer operation on ds 707 std::shared_ptr<TensorTransform> bert_tokenizer = std::make_shared<text::BertTokenizer>(vocab, "##", -1); 708 EXPECT_NE(bert_tokenizer, nullptr); 709 710 // Create a Map operation on ds 711 ds = ds->Map({bert_tokenizer}); 712 EXPECT_NE(ds, nullptr); 713 714 
std::shared_ptr<Iterator> iter = ds->CreateIterator(); 715 // Expect failure: invalid BertTokenizer input with nullptr vocab 716 EXPECT_EQ(iter, nullptr); 717 } 718 719 TEST_F(MindDataTestPipeline, TestCaseFoldSuccess) { 720 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaseFoldSuccess."; 721 722 // Create a TextFile dataset 723 std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt"; 724 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 725 EXPECT_NE(ds, nullptr); 726 727 // Create casefold operation on ds 728 std::shared_ptr<TensorTransform> casefold = std::make_shared<text::CaseFold>(); 729 EXPECT_NE(casefold, nullptr); 730 731 // Create Map operation on ds 732 ds = ds->Map({casefold}, {"text"}); 733 EXPECT_NE(ds, nullptr); 734 735 // Create an iterator over the result of the above dataset 736 // This will trigger the creation of the Execution Tree and launch it. 737 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 738 EXPECT_NE(iter, nullptr); 739 740 // Iterate the dataset and get each row 741 std::unordered_map<std::string, mindspore::MSTensor> row; 742 ASSERT_OK(iter->GetNextRow(&row)); 743 744 std::vector<std::string> expected = {"welcome to beijing!", "北京欢迎您!", "我喜欢english!", " "}; 745 746 uint64_t i = 0; 747 while (row.size() != 0) { 748 auto ind = row["text"]; 749 std::shared_ptr<Tensor> de_expected_tensor; 750 ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor)); 751 mindspore::MSTensor ms_expected_tensor = 752 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 753 EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor); 754 ASSERT_OK(iter->GetNextRow(&row)); 755 i++; 756 } 757 758 EXPECT_EQ(i, 4); 759 760 // Manually terminate the pipeline 761 iter->Stop(); 762 } 763 764 TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess) { 765 // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is false. 
766 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess."; 767 768 // Create a TextFile dataset 769 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; 770 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; 771 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; 772 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 773 EXPECT_NE(ds, nullptr); 774 775 // Create jieba_tokenizer operation on ds 776 std::shared_ptr<TensorTransform> jieba_tokenizer = 777 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp); 778 EXPECT_NE(jieba_tokenizer, nullptr); 779 780 // Create Map operation on ds 781 ds = ds->Map({jieba_tokenizer}, {"text"}); 782 EXPECT_NE(ds, nullptr); 783 784 // Create an iterator over the result of the above dataset 785 // This will trigger the creation of the Execution Tree and launch it. 786 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 787 EXPECT_NE(iter, nullptr); 788 789 // Iterate the dataset and get each row 790 std::unordered_map<std::string, mindspore::MSTensor> row; 791 ASSERT_OK(iter->GetNextRow(&row)); 792 793 std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"}; 794 std::shared_ptr<Tensor> de_expected_tensor; 795 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); 796 mindspore::MSTensor expected_tensor = 797 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 798 799 uint64_t i = 0; 800 while (row.size() != 0) { 801 auto ind = row["text"]; 802 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 803 ASSERT_OK(iter->GetNextRow(&row)); 804 i++; 805 } 806 807 EXPECT_EQ(i, 1); 808 809 // Manually terminate the pipeline 810 iter->Stop(); 811 } 812 813 TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess1) { 814 // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kHmm and the with_offsets is false. 
815 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess1."; 816 817 // Create a TextFile dataset 818 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; 819 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; 820 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; 821 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 822 EXPECT_NE(ds, nullptr); 823 824 // Create jieba_tokenizer operation on ds 825 std::shared_ptr<TensorTransform> jieba_tokenizer = 826 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kHmm); 827 EXPECT_NE(jieba_tokenizer, nullptr); 828 829 // Create Map operation on ds 830 ds = ds->Map({jieba_tokenizer}, {"text"}); 831 EXPECT_NE(ds, nullptr); 832 833 // Create an iterator over the result of the above dataset 834 // This will trigger the creation of the Execution Tree and launch it. 835 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 836 EXPECT_NE(iter, nullptr); 837 838 // Iterate the dataset and get each row 839 std::unordered_map<std::string, mindspore::MSTensor> row; 840 ASSERT_OK(iter->GetNextRow(&row)); 841 842 std::vector<std::string> expected = {"今天", "天气", "太", "好", "了", "我们", "一起", "去", "外面", "玩", "吧"}; 843 std::shared_ptr<Tensor> de_expected_tensor; 844 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); 845 mindspore::MSTensor expected_tensor = 846 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 847 848 uint64_t i = 0; 849 while (row.size() != 0) { 850 auto ind = row["text"]; 851 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 852 ASSERT_OK(iter->GetNextRow(&row)); 853 i++; 854 } 855 856 EXPECT_EQ(i, 1); 857 858 // Manually terminate the pipeline 859 iter->Stop(); 860 } 861 862 TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess2) { 863 // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is true. 
864 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess2."; 865 866 // Create a TextFile dataset 867 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; 868 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; 869 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; 870 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 871 EXPECT_NE(ds, nullptr); 872 873 // Create jieba_tokenizer operation on ds 874 std::shared_ptr<TensorTransform> jieba_tokenizer = 875 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp, true); 876 EXPECT_NE(jieba_tokenizer, nullptr); 877 878 // Create Map operation on ds 879 ds = ds->Map({jieba_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"}, 880 {"token", "offsets_start", "offsets_limit"}); 881 EXPECT_NE(ds, nullptr); 882 883 // Create an iterator over the result of the above dataset 884 // This will trigger the creation of the Execution Tree and launch it. 
885 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 886 EXPECT_NE(iter, nullptr); 887 888 // Iterate the dataset and get each row 889 std::unordered_map<std::string, mindspore::MSTensor> row; 890 ASSERT_OK(iter->GetNextRow(&row)); 891 892 std::vector<std::string> expected_tokens = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"}; 893 std::vector<uint32_t> expected_offsets_start = {0, 12, 21, 27, 33, 36, 42}; 894 std::vector<uint32_t> expected_offsets_limit = {12, 21, 27, 33, 36, 42, 48}; 895 896 std::shared_ptr<Tensor> de_expected_tokens; 897 ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens)); 898 mindspore::MSTensor ms_expected_tokens = 899 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens)); 900 901 std::shared_ptr<Tensor> de_expected_offsets_start; 902 ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start)); 903 mindspore::MSTensor ms_expected_offsets_start = 904 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start)); 905 906 std::shared_ptr<Tensor> de_expected_offsets_limit; 907 ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit)); 908 mindspore::MSTensor ms_expected_offsets_limit = 909 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit)); 910 911 uint64_t i = 0; 912 while (row.size() != 0) { 913 auto ind = row["token"]; 914 EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens); 915 916 auto start = row["offsets_start"]; 917 EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start); 918 919 auto limit = row["offsets_limit"]; 920 EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit); 921 922 ASSERT_OK(iter->GetNextRow(&row)); 923 i++; 924 } 925 926 EXPECT_EQ(i, 1); 927 928 // Manually terminate the pipeline 929 iter->Stop(); 930 } 931 932 TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail1) { 933 // Testing the incorrect parameter of JiebaTokenizer interface. 
934 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail1."; 935 936 // Create a TextFile dataset 937 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; 938 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; 939 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 940 EXPECT_NE(ds, nullptr); 941 942 // Create jieba_tokenizer operation on ds 943 // Testing the parameter hmm_path is empty 944 std::shared_ptr<TensorTransform> jieba_tokenizer = 945 std::make_shared<text::JiebaTokenizer>("", mp_path, JiebaMode::kMp); 946 EXPECT_NE(jieba_tokenizer, nullptr); 947 948 // Create a Map operation on ds 949 ds = ds->Map({jieba_tokenizer}); 950 EXPECT_NE(ds, nullptr); 951 952 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 953 // Expect failure: invalid JiebaTokenizer input (parameter hmm_path is empty) 954 EXPECT_EQ(iter, nullptr); 955 } 956 957 TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail2) { 958 // Testing the incorrect parameter of JiebaTokenizer interface. 
959 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail2."; 960 961 // Create a TextFile dataset 962 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; 963 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; 964 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 965 EXPECT_NE(ds, nullptr); 966 967 // Create jieba_tokenizer operation on ds 968 // Testing the parameter mp_path is empty 969 std::shared_ptr<TensorTransform> jieba_tokenizer = 970 std::make_shared<text::JiebaTokenizer>(hmm_path, "", JiebaMode::kMp); 971 EXPECT_NE(jieba_tokenizer, nullptr); 972 973 // Create a Map operation on ds 974 ds = ds->Map({jieba_tokenizer}); 975 EXPECT_NE(ds, nullptr); 976 977 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 978 // Expect failure: invalid JiebaTokenizer input (parameter mp_path is empty) 979 EXPECT_EQ(iter, nullptr); 980 } 981 982 TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail3) { 983 // Testing the incorrect parameter of JiebaTokenizer interface. 
984 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail3."; 985 986 // Create a TextFile dataset 987 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; 988 std::string hmm_path_invalid = datasets_root_path_ + "/jiebadict/1.txt"; 989 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; 990 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 991 EXPECT_NE(ds, nullptr); 992 993 // Create jieba_tokenizer operation on ds 994 // Testing the parameter hmm_path is invalid path 995 std::shared_ptr<TensorTransform> jieba_tokenizer = 996 std::make_shared<text::JiebaTokenizer>(hmm_path_invalid, mp_path, JiebaMode::kMp); 997 EXPECT_NE(jieba_tokenizer, nullptr); 998 999 // Create a Map operation on ds 1000 ds = ds->Map({jieba_tokenizer}); 1001 EXPECT_NE(ds, nullptr); 1002 1003 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 1004 // Expect failure: invalid JiebaTokenizer input (parameter hmm_path is invalid path) 1005 EXPECT_EQ(iter, nullptr); 1006 } 1007 1008 TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail4) { 1009 // Testing the incorrect parameter of JiebaTokenizer interface. 
1010 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail4."; 1011 1012 // Create a TextFile dataset 1013 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; 1014 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; 1015 std::string mp_path_invalid = datasets_root_path_ + "/jiebadict/1.txt"; 1016 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 1017 EXPECT_NE(ds, nullptr); 1018 1019 // Create jieba_tokenizer operation on ds 1020 // Testing the parameter mp_path is invalid path 1021 std::shared_ptr<TensorTransform> jieba_tokenizer = 1022 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path_invalid, JiebaMode::kMp); 1023 EXPECT_NE(jieba_tokenizer, nullptr); 1024 1025 // Create a Map operation on ds 1026 ds = ds->Map({jieba_tokenizer}); 1027 EXPECT_NE(ds, nullptr); 1028 1029 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 1030 // Expect failure: invalid JiebaTokenizer input (parameter mp_path is invalid path) 1031 EXPECT_EQ(iter, nullptr); 1032 } 1033 1034 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) { 1035 // Testing the parameter AddWord of JiebaTokenizer when the freq is not provided (default 0). 
1036 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord."; 1037 1038 // Create a TextFile dataset 1039 std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt"; 1040 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; 1041 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; 1042 std::shared_ptr<Dataset> ds = TextFile({data_file}); 1043 EXPECT_NE(ds, nullptr); 1044 1045 // Create jieba_tokenizer operation on ds 1046 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer = 1047 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp); 1048 EXPECT_NE(jieba_tokenizer, nullptr); 1049 1050 // Add word with freq not provided (default 0) 1051 ASSERT_OK(jieba_tokenizer->AddWord("男默女泪")); 1052 1053 // Create Map operation on ds 1054 ds = ds->Map({jieba_tokenizer}, {"text"}); 1055 EXPECT_NE(ds, nullptr); 1056 1057 // Create an iterator over the result of the above dataset 1058 // This will trigger the creation of the Execution Tree and launch it. 
1059 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 1060 EXPECT_NE(iter, nullptr); 1061 1062 // Iterate the dataset and get each row 1063 std::unordered_map<std::string, mindspore::MSTensor> row; 1064 ASSERT_OK(iter->GetNextRow(&row)); 1065 1066 std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"}; 1067 std::shared_ptr<Tensor> de_expected_tensor; 1068 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); 1069 mindspore::MSTensor expected_tensor = 1070 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 1071 1072 uint64_t i = 0; 1073 while (row.size() != 0) { 1074 auto ind = row["text"]; 1075 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 1076 ASSERT_OK(iter->GetNextRow(&row)); 1077 i++; 1078 } 1079 1080 EXPECT_EQ(i, 1); 1081 1082 // Manually terminate the pipeline 1083 iter->Stop(); 1084 } 1085 1086 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) { 1087 // Testing the parameter AddWord of JiebaTokenizer when the freq is set explicitly to 0. 
1088 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord1."; 1089 1090 // Create a TextFile dataset 1091 std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt"; 1092 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; 1093 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; 1094 std::shared_ptr<Dataset> ds = TextFile({data_file}); 1095 EXPECT_NE(ds, nullptr); 1096 1097 // Create jieba_tokenizer operation on ds 1098 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer = 1099 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp); 1100 EXPECT_NE(jieba_tokenizer, nullptr); 1101 1102 // Add word with freq is set explicitly to 0 1103 ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 0)); 1104 1105 // Create Map operation on ds 1106 ds = ds->Map({jieba_tokenizer}, {"text"}); 1107 EXPECT_NE(ds, nullptr); 1108 1109 // Create an iterator over the result of the above dataset 1110 // This will trigger the creation of the Execution Tree and launch it. 
1111 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 1112 EXPECT_NE(iter, nullptr); 1113 1114 // Iterate the dataset and get each row 1115 std::unordered_map<std::string, mindspore::MSTensor> row; 1116 ASSERT_OK(iter->GetNextRow(&row)); 1117 1118 std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"}; 1119 std::shared_ptr<Tensor> de_expected_tensor; 1120 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); 1121 mindspore::MSTensor expected_tensor = 1122 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 1123 1124 uint64_t i = 0; 1125 while (row.size() != 0) { 1126 auto ind = row["text"]; 1127 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 1128 ASSERT_OK(iter->GetNextRow(&row)); 1129 i++; 1130 } 1131 1132 EXPECT_EQ(i, 1); 1133 1134 // Manually terminate the pipeline 1135 iter->Stop(); 1136 } 1137 1138 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) { 1139 // Testing the parameter AddWord of JiebaTokenizer when the freq is 10. 
1140 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord2."; 1141 1142 // Create a TextFile dataset 1143 std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt"; 1144 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; 1145 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; 1146 std::shared_ptr<Dataset> ds = TextFile({data_file}); 1147 EXPECT_NE(ds, nullptr); 1148 1149 // Create jieba_tokenizer operation on ds 1150 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer = 1151 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp); 1152 EXPECT_NE(jieba_tokenizer, nullptr); 1153 1154 // Add word with freq 10 1155 ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 10)); 1156 1157 // Create Map operation on ds 1158 ds = ds->Map({jieba_tokenizer}, {"text"}); 1159 EXPECT_NE(ds, nullptr); 1160 1161 // Create an iterator over the result of the above dataset 1162 // This will trigger the creation of the Execution Tree and launch it. 
1163 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 1164 EXPECT_NE(iter, nullptr); 1165 1166 // Iterate the dataset and get each row 1167 std::unordered_map<std::string, mindspore::MSTensor> row; 1168 ASSERT_OK(iter->GetNextRow(&row)); 1169 1170 std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"}; 1171 std::shared_ptr<Tensor> de_expected_tensor; 1172 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); 1173 mindspore::MSTensor expected_tensor = 1174 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 1175 1176 uint64_t i = 0; 1177 while (row.size() != 0) { 1178 auto ind = row["text"]; 1179 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 1180 ASSERT_OK(iter->GetNextRow(&row)); 1181 i++; 1182 } 1183 1184 EXPECT_EQ(i, 1); 1185 1186 // Manually terminate the pipeline 1187 iter->Stop(); 1188 } 1189 1190 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) { 1191 // Testing the parameter AddWord of JiebaTokenizer when the freq is 20000 which affects the result of segmentation. 
1192 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord3."; 1193 1194 // Create a TextFile dataset 1195 std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt"; 1196 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; 1197 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; 1198 std::shared_ptr<Dataset> ds = TextFile({data_file}); 1199 EXPECT_NE(ds, nullptr); 1200 1201 // Create jieba_tokenizer operation on ds 1202 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer = 1203 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp); 1204 EXPECT_NE(jieba_tokenizer, nullptr); 1205 1206 // Add word with freq 20000 1207 ASSERT_OK(jieba_tokenizer->AddWord("江大桥", 20000)); 1208 1209 // Create Map operation on ds 1210 ds = ds->Map({jieba_tokenizer}, {"text"}); 1211 EXPECT_NE(ds, nullptr); 1212 1213 // Create an iterator over the result of the above dataset 1214 // This will trigger the creation of the Execution Tree and launch it. 
1215 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 1216 EXPECT_NE(iter, nullptr); 1217 1218 // Iterate the dataset and get each row 1219 std::unordered_map<std::string, mindspore::MSTensor> row; 1220 ASSERT_OK(iter->GetNextRow(&row)); 1221 1222 std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"}; 1223 std::shared_ptr<Tensor> de_expected_tensor; 1224 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); 1225 mindspore::MSTensor expected_tensor = 1226 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 1227 1228 uint64_t i = 0; 1229 while (row.size() != 0) { 1230 auto ind = row["text"]; 1231 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 1232 ASSERT_OK(iter->GetNextRow(&row)); 1233 i++; 1234 } 1235 1236 EXPECT_EQ(i, 1); 1237 1238 // Manually terminate the pipeline 1239 iter->Stop(); 1240 } 1241 1242 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) { 1243 // Testing the incorrect parameter of AddWord in JiebaTokenizer. 
1244 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWordFail."; 1245 1246 // Create a TextFile dataset 1247 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; 1248 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; 1249 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; 1250 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 1251 EXPECT_NE(ds, nullptr); 1252 1253 // Testing the parameter word of AddWord is empty 1254 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer = 1255 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp); 1256 EXPECT_NE(jieba_tokenizer, nullptr); 1257 EXPECT_NE(jieba_tokenizer->AddWord("", 10), Status::OK()); 1258 // Testing the parameter freq of AddWord is negative 1259 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer1 = 1260 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp); 1261 EXPECT_NE(jieba_tokenizer1, nullptr); 1262 EXPECT_NE(jieba_tokenizer1->AddWord("我们", -1), Status::OK()); 1263 } 1264 1265 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDict) { 1266 // Testing AddDict of JiebaTokenizer when the input is a vector of word-freq pair. 
1267 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDict."; 1268 1269 // Create a TextFile dataset 1270 std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt"; 1271 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; 1272 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; 1273 std::shared_ptr<Dataset> ds = TextFile({data_file}); 1274 EXPECT_NE(ds, nullptr); 1275 1276 // Create jieba_tokenizer operation on ds 1277 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer = 1278 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp); 1279 EXPECT_NE(jieba_tokenizer, nullptr); 1280 1281 // Add word with freq 20000 1282 std::vector<std::pair<std::string, int64_t>> user_dict = {{"江大桥", 20000}}; 1283 ASSERT_OK(jieba_tokenizer->AddDict(user_dict)); 1284 1285 // Create Map operation on ds 1286 ds = ds->Map({jieba_tokenizer}, {"text"}); 1287 EXPECT_NE(ds, nullptr); 1288 1289 // Create an iterator over the result of the above dataset 1290 // This will trigger the creation of the Execution Tree and launch it. 
1291 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 1292 EXPECT_NE(iter, nullptr); 1293 1294 // Iterate the dataset and get each row 1295 std::unordered_map<std::string, mindspore::MSTensor> row; 1296 ASSERT_OK(iter->GetNextRow(&row)); 1297 1298 std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"}; 1299 std::shared_ptr<Tensor> de_expected_tensor; 1300 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); 1301 mindspore::MSTensor expected_tensor = 1302 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 1303 1304 uint64_t i = 0; 1305 while (row.size() != 0) { 1306 auto txt = row["text"]; 1307 EXPECT_MSTENSOR_EQ(txt, expected_tensor); 1308 ASSERT_OK(iter->GetNextRow(&row)); 1309 i++; 1310 } 1311 1312 EXPECT_EQ(i, 1); 1313 1314 // Manually terminate the pipeline 1315 iter->Stop(); 1316 } 1317 1318 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDictFromFile) { 1319 // Testing AddDict of JiebaTokenizer when the input is a path to dict. 
1320 // Test error scenario for AddDict: invalid path 1321 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDictFromFile."; 1322 1323 // Create a TextFile dataset 1324 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt"; 1325 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8"; 1326 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8"; 1327 std::shared_ptr<Dataset> ds = TextFile({data_file}); 1328 EXPECT_NE(ds, nullptr); 1329 1330 // Create jieba_tokenizer operation on ds 1331 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer = 1332 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp); 1333 EXPECT_NE(jieba_tokenizer, nullptr); 1334 1335 // Load dict from txt file 1336 std::string user_dict_path = datasets_root_path_ + "/testJiebaDataset/user_dict.txt"; 1337 std::string invalid_path = datasets_root_path_ + "/testJiebaDataset/invalid_path.txt"; 1338 EXPECT_ERROR(jieba_tokenizer->AddDict(invalid_path)); 1339 ASSERT_OK(jieba_tokenizer->AddDict(user_dict_path)); 1340 1341 // Create Map operation on ds 1342 ds = ds->Map({jieba_tokenizer}, {"text"}); 1343 EXPECT_NE(ds, nullptr); 1344 1345 // Create an iterator over the result of the above dataset 1346 // This will trigger the creation of the Execution Tree and launch it. 
1347 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 1348 EXPECT_NE(iter, nullptr); 1349 1350 // Iterate the dataset and get each row 1351 std::unordered_map<std::string, mindspore::MSTensor> row; 1352 ASSERT_OK(iter->GetNextRow(&row)); 1353 1354 std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"}; 1355 std::shared_ptr<Tensor> de_expected_tensor; 1356 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor)); 1357 mindspore::MSTensor expected_tensor = 1358 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 1359 1360 uint64_t i = 0; 1361 while (row.size() != 0) { 1362 auto txt = row["text"]; 1363 EXPECT_MSTENSOR_EQ(txt, expected_tensor); 1364 ASSERT_OK(iter->GetNextRow(&row)); 1365 i++; 1366 } 1367 1368 EXPECT_EQ(i, 1); 1369 1370 // Manually terminate the pipeline 1371 iter->Stop(); 1372 } 1373 1374 TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) { 1375 // Testing the parameter of SlidingWindow interface when the axis is 0. 1376 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess."; 1377 1378 // Create a TextFile dataset 1379 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt"; 1380 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 1381 EXPECT_NE(ds, nullptr); 1382 1383 // Create white_tokenizer operation on ds 1384 std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>(); 1385 EXPECT_NE(white_tokenizer, nullptr); 1386 // Create sliding_window operation on ds 1387 std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(3, 0); 1388 EXPECT_NE(sliding_window, nullptr); 1389 1390 // Create Map operation on ds 1391 ds = ds->Map({white_tokenizer, sliding_window}, {"text"}); 1392 EXPECT_NE(ds, nullptr); 1393 1394 // Create an iterator over the result of the above dataset 1395 // This will trigger the creation of the Execution Tree and launch it. 
1396 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 1397 EXPECT_NE(iter, nullptr); 1398 1399 // Iterate the dataset and get each row 1400 std::unordered_map<std::string, mindspore::MSTensor> row; 1401 ASSERT_OK(iter->GetNextRow(&row)); 1402 1403 std::vector<std::vector<std::string>> expected = {{"This", "is", "a", "is", "a", "text", "a", "text", "file."}, 1404 {"Be", "happy", "every", "happy", "every", "day."}, 1405 {"Good", "luck", "to", "luck", "to", "everyone."}}; 1406 1407 uint64_t i = 0; 1408 while (row.size() != 0) { 1409 auto ind = row["text"]; 1410 1411 std::shared_ptr<Tensor> de_expected_tensor; 1412 int x = expected[i].size() / 3; 1413 ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x, 3}), &de_expected_tensor)); 1414 mindspore::MSTensor expected_tensor = 1415 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 1416 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 1417 1418 ASSERT_OK(iter->GetNextRow(&row)); 1419 i++; 1420 } 1421 1422 EXPECT_EQ(i, 3); 1423 1424 // Manually terminate the pipeline 1425 iter->Stop(); 1426 } 1427 1428 TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess1) { 1429 // Testing the parameter of SlidingWindow interface when the axis is -1. 
1430 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess1."; 1431 1432 // Create a TextFile dataset 1433 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt"; 1434 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 1435 EXPECT_NE(ds, nullptr); 1436 1437 // Create white_tokenizer operation on ds 1438 std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>(); 1439 EXPECT_NE(white_tokenizer, nullptr); 1440 // Create sliding_window operation on ds 1441 std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(2, -1); 1442 EXPECT_NE(sliding_window, nullptr); 1443 1444 // Create Map operation on ds 1445 ds = ds->Map({white_tokenizer, sliding_window}, {"text"}); 1446 EXPECT_NE(ds, nullptr); 1447 1448 // Create an iterator over the result of the above dataset 1449 // This will trigger the creation of the Execution Tree and launch it. 1450 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 1451 EXPECT_NE(iter, nullptr); 1452 1453 // Iterate the dataset and get each row 1454 std::unordered_map<std::string, mindspore::MSTensor> row; 1455 ASSERT_OK(iter->GetNextRow(&row)); 1456 1457 std::vector<std::vector<std::string>> expected = {{"This", "is", "is", "a", "a", "text", "text", "file."}, 1458 {"Be", "happy", "happy", "every", "every", "day."}, 1459 {"Good", "luck", "luck", "to", "to", "everyone."}}; 1460 uint64_t i = 0; 1461 while (row.size() != 0) { 1462 auto ind = row["text"]; 1463 1464 std::shared_ptr<Tensor> de_expected_tensor; 1465 int x = expected[i].size() / 2; 1466 ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x, 2}), &de_expected_tensor)); 1467 mindspore::MSTensor expected_tensor = 1468 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 1469 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 1470 1471 ASSERT_OK(iter->GetNextRow(&row)); 1472 i++; 1473 } 1474 1475 EXPECT_EQ(i, 3); 1476 1477 
// Manually terminate the pipeline 1478 iter->Stop(); 1479 } 1480 1481 TEST_F(MindDataTestPipeline, TestSlidingWindowFail1) { 1482 // Testing the incorrect parameter of SlidingWindow interface. 1483 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail1."; 1484 1485 // Create a TextFile dataset 1486 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt"; 1487 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 1488 EXPECT_NE(ds, nullptr); 1489 1490 // Create sliding_window operation on ds 1491 // Testing the parameter width less than or equal to 0 1492 // The parameter axis support 0 or -1 only for now 1493 std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(0, 0); 1494 EXPECT_NE(sliding_window, nullptr); 1495 1496 // Create a Map operation on ds 1497 ds = ds->Map({sliding_window}); 1498 EXPECT_NE(ds, nullptr); 1499 1500 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 1501 // Expect failure: invalid SlidingWindow input (width less than or equal to 0) 1502 EXPECT_EQ(iter, nullptr); 1503 } 1504 1505 TEST_F(MindDataTestPipeline, TestSlidingWindowFail2) { 1506 // Testing the incorrect parameter of SlidingWindow interface. 
1507 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail2."; 1508 1509 // Create a TextFile dataset 1510 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt"; 1511 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 1512 EXPECT_NE(ds, nullptr); 1513 1514 // Create sliding_window operation on ds 1515 // Testing the parameter width less than or equal to 0 1516 // The parameter axis support 0 or -1 only for now 1517 std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(-2, 0); 1518 EXPECT_NE(sliding_window, nullptr); 1519 1520 // Create a Map operation on ds 1521 ds = ds->Map({sliding_window}); 1522 EXPECT_NE(ds, nullptr); 1523 1524 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 1525 // Expect failure: invalid SlidingWindow input (width less than or equal to 0) 1526 EXPECT_EQ(iter, nullptr); 1527 } 1528 1529 TEST_F(MindDataTestPipeline, TestToNumberSuccess1) { 1530 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberSuccess1."; 1531 // Test ToNumber with integer numbers 1532 1533 std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt"; 1534 1535 // Create a TextFile dataset 1536 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 1537 EXPECT_NE(ds, nullptr); 1538 1539 // Create a Take operation on ds 1540 ds = ds->Take(8); 1541 EXPECT_NE(ds, nullptr); 1542 1543 // Create ToNumber operation on ds 1544 std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64); 1545 EXPECT_NE(to_number, nullptr); 1546 1547 // Create a Map operation on ds 1548 ds = ds->Map({to_number}, {"text"}); 1549 EXPECT_NE(ds, nullptr); 1550 1551 // Create an iterator over the result of the above dataset 1552 // This will trigger the creation of the Execution Tree and launch it. 
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Expected int64 values parsed by ToNumber from the first 8 lines of to_number.txt
  std::vector<int64_t> expected = {-121, 14, -2219, 7623, -8162536, 162371864, -1726483716, 98921728421};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    // Build the expected scalar tensor and compare it against the pipeline output
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
    mindspore::MSTensor ms_expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 8 rows should have been consumed
  EXPECT_EQ(i, 8);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestToNumberSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberSuccess2.";
  // Test ToNumber with float numbers

  std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Create a TextFile dataset
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a Skip operation on ds (skip the 8 integer lines at the top of the file)
  ds = ds->Skip(8);
  EXPECT_NE(ds, nullptr);

  // Create a Take operation on ds (the 6 float lines)
  ds = ds->Take(6);
  EXPECT_NE(ds, nullptr);

  // Create ToNumber operation on ds
  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
  EXPECT_NE(to_number, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({to_number}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Expected float64 values parsed by ToNumber
  std::vector<double_t> expected = {-1.1, 1.4, -2219.321, 7623.453, -816256.234282, 162371864.243243};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
    mindspore::MSTensor ms_expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestToNumberFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail1.";
  // Test ToNumber with overflow integer numbers

  std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Create a TextFile dataset
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a Skip operation on ds
  ds = ds->Skip(2);
  EXPECT_NE(ds, nullptr);

  // Create a Take operation on ds
  ds = ds->Take(6);
  EXPECT_NE(ds, nullptr);

  // Create ToNumber operation on ds
  // int8 is deliberately too narrow for the values in the file, so the runtime should error
  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
  EXPECT_NE(to_number, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({to_number}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;

  // Expect error: input out of bounds of int8
  EXPECT_ERROR(iter->GetNextRow(&row));

  uint64_t i = 0;
  while (row.size() != 0) {
    EXPECT_ERROR(iter->GetNextRow(&row));
    i++;
  }

  // Expect failure: GetNextRow fail and return nothing
  EXPECT_EQ(i, 0);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestToNumberFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail2.";
  // Test ToNumber with overflow float numbers

  std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Create a TextFile dataset
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a Skip operation on ds
  ds = ds->Skip(12);
  EXPECT_NE(ds, nullptr);

  // Create a Take operation on ds
  ds = ds->Take(2);
  EXPECT_NE(ds, nullptr);

  // Create ToNumber operation on ds
  // float16 cannot represent the magnitudes in these lines, so the runtime should error
  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
  EXPECT_NE(to_number, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({to_number}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;

  // Expect error: input out of bounds of float16
  EXPECT_ERROR(iter->GetNextRow(&row));

  uint64_t i = 0;
  while (row.size() != 0) {
    EXPECT_ERROR(iter->GetNextRow(&row));
    i++;
  }

  // Expect failure: GetNextRow fail and return nothing
  EXPECT_EQ(i, 0);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestToNumberFail3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail3.";
  // Test ToNumber with non numerical input

  std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Create a TextFile dataset
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a Skip operation on ds
  ds = ds->Skip(14);
  EXPECT_NE(ds, nullptr);

  // Create ToNumber operation on ds
  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
  EXPECT_NE(to_number, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({to_number}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;

  // Expect error: invalid input which is non numerical
  EXPECT_ERROR(iter->GetNextRow(&row));

  uint64_t i = 0;
  while (row.size() != 0) {
    EXPECT_ERROR(iter->GetNextRow(&row));
    i++;
  }

  // Expect failure: GetNextRow fail and return nothing
  EXPECT_EQ(i, 0);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestToNumberFail4) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail4.";
  // Test ToNumber with non numerical data type

  std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Create a TextFile dataset
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create ToNumber operation on ds
  // String is not a numeric type, so parameter validation should reject it
  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kObjectTypeString);
  EXPECT_NE(to_number, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({to_number}, {"text"});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid parameter with non numerical data type
  EXPECT_EQ(iter, nullptr);
}

TEST_F(MindDataTestPipeline, TestToNumberFail5) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail5.";
  // Test ToNumber with non numerical data type

  std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Create a TextFile dataset
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create ToNumber operation on ds
  // Bool is likewise rejected by ToNumber's parameter validation
  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeBool);
  EXPECT_NE(to_number, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({to_number}, {"text"});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid parameter with non numerical data type
  EXPECT_EQ(iter, nullptr);
}

TEST_F(MindDataTestPipeline, TestTruncateSequencePairSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairSuccess1.";
  // Testing basic TruncateSequencePair

  // Set seed for RandomDataset so the generated data (and expected values below) are reproducible
  auto original_seed = config::get_seed();
  bool status_set_seed = config::set_seed(0);
  EXPECT_EQ(status_set_seed, true);

  // Set num_parallel_workers for RandomDataset
  auto original_worker = config::get_num_parallel_workers();
  bool status_set_worker = config::set_num_parallel_workers(1);
  EXPECT_EQ(status_set_worker, true);

  // Create a RandomDataset which has column names "col1" and "col2"
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt16, {5}));
  ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt32, {3}));
  std::shared_ptr<Dataset> ds = RandomData(3, schema);
  EXPECT_NE(ds, nullptr);

  // Create a truncate_sequence_pair operation on ds
  std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(4);
  EXPECT_NE(truncate_sequence_pair, nullptr);

  // Create Map operation on ds
  ds = ds->Map({truncate_sequence_pair}, {"col1", "col2"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Expected truncated sequences: max_length 4 keeps 2 elements from each column
  std::vector<std::vector<int16_t>> expected1 = {{-29556, -29556}, {-18505, -18505}, {-25958, -25958}};
  std::vector<std::vector<int32_t>> expected2 = {
    {-1751672937, -1751672937}, {-656877352, -656877352}, {-606348325, -606348325}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind1 = row["col1"];
    auto ind2 = row["col2"];

    std::shared_ptr<Tensor> de_expected_tensor1;
    ASSERT_OK(Tensor::CreateFromVector(expected1[i], &de_expected_tensor1));
    mindspore::MSTensor expected_tensor1 =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor1));
    EXPECT_MSTENSOR_EQ(ind1, expected_tensor1);

    std::shared_ptr<Tensor> de_expected_tensor2;
    ASSERT_OK(Tensor::CreateFromVector(expected2[i], &de_expected_tensor2));
    mindspore::MSTensor expected_tensor2 =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor2));
    EXPECT_MSTENSOR_EQ(ind2, expected_tensor2);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline
  iter->Stop();

  // Restore original seed and num_parallel_workers
  status_set_seed = config::set_seed(original_seed);
  EXPECT_EQ(status_set_seed, true);
  status_set_worker = config::set_num_parallel_workers(original_worker);
  EXPECT_EQ(status_set_worker, true);
}

TEST_F(MindDataTestPipeline, TestTruncateSequencePairSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairSuccess2.";
  // Testing basic TruncateSequencePair with odd max_length

  // Set seed for RandomDataset so the generated data (and expected values below) are reproducible
  auto original_seed = config::get_seed();
  bool status_set_seed = config::set_seed(1);
  EXPECT_EQ(status_set_seed, true);

  // Set num_parallel_workers for RandomDataset
  auto original_worker = config::get_num_parallel_workers();
  bool status_set_worker = config::set_num_parallel_workers(1);
  EXPECT_EQ(status_set_worker, true);

  // Create a RandomDataset which has column names "col1" and "col2"
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt32, {4}));
  ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt64, {4}));
  std::shared_ptr<Dataset> ds = RandomData(4, schema);
  EXPECT_NE(ds, nullptr);

  // Create a truncate_sequence_pair operation on ds
  std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(5);
  EXPECT_NE(truncate_sequence_pair, nullptr);

  // Create Map operation on ds
  ds = ds->Map({truncate_sequence_pair}, {"col1", "col2"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // With odd max_length 5, col1 keeps 3 elements and col2 keeps 2
  std::vector<std::vector<int32_t>> expected1 = {{1785358954, 1785358954, 1785358954},
                                                 {-1195853640, -1195853640, -1195853640},
                                                 {0, 0, 0},
                                                 {1296911693, 1296911693, 1296911693}};
  std::vector<std::vector<int64_t>> expected2 = {
    {-1, -1}, {-1229782938247303442, -1229782938247303442}, {2314885530818453536, 2314885530818453536}, {-1, -1}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind1 = row["col1"];
    auto ind2 = row["col2"];

    std::shared_ptr<Tensor> de_expected_tensor1;
    ASSERT_OK(Tensor::CreateFromVector(expected1[i], &de_expected_tensor1));
    mindspore::MSTensor expected_tensor1 =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor1));
    EXPECT_MSTENSOR_EQ(ind1, expected_tensor1);

    std::shared_ptr<Tensor> de_expected_tensor2;
    ASSERT_OK(Tensor::CreateFromVector(expected2[i], &de_expected_tensor2));
    mindspore::MSTensor expected_tensor2 =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor2));
    EXPECT_MSTENSOR_EQ(ind2, expected_tensor2);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();

  // Restore original seed and num_parallel_workers
  status_set_seed = config::set_seed(original_seed);
  EXPECT_EQ(status_set_seed, true);
  status_set_worker = config::set_num_parallel_workers(original_worker);
  EXPECT_EQ(status_set_worker, true);
}

TEST_F(MindDataTestPipeline, TestTruncateSequencePairFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairFail.";
  // Testing TruncateSequencePair with negative max_length

  // Create a RandomDataset which has column names "col1" and "col2"
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt8, {3}));
  ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt8, {3}));
  std::shared_ptr<Dataset> ds = RandomData(3, schema);
  EXPECT_NE(ds, nullptr);

  // Create a truncate_sequence_pair operation on ds
  std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(-1);
  EXPECT_NE(truncate_sequence_pair, nullptr);

  // Create a Map operation on ds
  // NOTE(review): unlike the success cases, no input columns are given here — the op then
  // applies to the first column(s) by default; validation still fails before columns matter.
  ds = ds->Map({truncate_sequence_pair});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid TruncateSequencePair input (invalid parameter with negative max_length)
  EXPECT_EQ(iter, nullptr);
}

TEST_F(MindDataTestPipeline, TestNgramSuccess) {
  // Testing the parameter of Ngram interface.
2006 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess."; 2007 2008 // Create a TextFile dataset 2009 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt"; 2010 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 2011 EXPECT_NE(ds, nullptr); 2012 2013 // Create white_tokenizer operation on ds 2014 std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>(); 2015 EXPECT_NE(white_tokenizer, nullptr); 2016 // Create sliding_window operation on ds 2017 std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2}, {"_", 1}, {"_", 1}, " ")); 2018 EXPECT_NE(ngram_op, nullptr); 2019 2020 // Create Map operation on ds 2021 ds = ds->Map({white_tokenizer, ngram_op}, {"text"}); 2022 EXPECT_NE(ds, nullptr); 2023 2024 // Create an iterator over the result of the above dataset 2025 // This will trigger the creation of the Execution Tree and launch it. 2026 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 2027 EXPECT_NE(iter, nullptr); 2028 2029 // Iterate the dataset and get each row 2030 std::unordered_map<std::string, mindspore::MSTensor> row; 2031 ASSERT_OK(iter->GetNextRow(&row)); 2032 2033 std::vector<std::vector<std::string>> expected = {{"_ This", "This is", "is a", "a text", "text file.", "file. _"}, 2034 {"_ Be", "Be happy", "happy every", "every day.", "day. _"}, 2035 {"_ Good", "Good luck", "luck to", "to everyone.", "everyone. 
_"}}; 2036 2037 uint64_t i = 0; 2038 while (row.size() != 0) { 2039 auto ind = row["text"]; 2040 2041 std::shared_ptr<Tensor> de_expected_tensor; 2042 int x = expected[i].size(); 2043 ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor)); 2044 mindspore::MSTensor expected_tensor = 2045 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 2046 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 2047 2048 ASSERT_OK(iter->GetNextRow(&row)); 2049 i++; 2050 } 2051 2052 EXPECT_EQ(i, 3); 2053 2054 // Manually terminate the pipeline 2055 iter->Stop(); 2056 } 2057 2058 TEST_F(MindDataTestPipeline, TestNgramSuccess1) { 2059 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess1."; 2060 2061 // Create a TextFile dataset 2062 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt"; 2063 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 2064 EXPECT_NE(ds, nullptr); 2065 2066 // Create white_tokenizer operation on ds 2067 std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>(); 2068 EXPECT_NE(white_tokenizer, nullptr); 2069 // Create sliding_window operation on ds 2070 std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2, 3}, {"&", 2}, {"&", 2}, "-")); 2071 EXPECT_NE(ngram_op, nullptr); 2072 2073 // Create Map operation on ds 2074 ds = ds->Map({white_tokenizer, ngram_op}, {"text"}); 2075 EXPECT_NE(ds, nullptr); 2076 2077 // Create an iterator over the result of the above dataset 2078 // This will trigger the creation of the Execution Tree and launch it. 
2079 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 2080 EXPECT_NE(iter, nullptr); 2081 2082 // Iterate the dataset and get each row 2083 std::unordered_map<std::string, mindspore::MSTensor> row; 2084 ASSERT_OK(iter->GetNextRow(&row)); 2085 2086 std::vector<std::vector<std::string>> expected = { 2087 {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a", 2088 "is-a-text", 2089 "a-text-file.", "text-file.-&", "file.-&-&"}, 2090 {"&-Be", "Be-happy", "happy-every", "every-day.", "day.-&", "&-&-Be", "&-Be-happy", "Be-happy-every", 2091 "happy-every-day.", "every-day.-&", "day.-&-&"}, 2092 {"&-Good", "Good-luck", "luck-to", "to-everyone.", "everyone.-&", "&-&-Good", "&-Good-luck", "Good-luck-to", 2093 "luck-to-everyone.", "to-everyone.-&", "everyone.-&-&"}}; 2094 2095 uint64_t i = 0; 2096 while (row.size() != 0) { 2097 auto ind = row["text"]; 2098 2099 std::shared_ptr<Tensor> de_expected_tensor; 2100 int x = expected[i].size(); 2101 ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor)); 2102 mindspore::MSTensor expected_tensor = 2103 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 2104 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 2105 2106 ASSERT_OK(iter->GetNextRow(&row)); 2107 i++; 2108 } 2109 2110 EXPECT_EQ(i, 3); 2111 2112 // Manually terminate the pipeline 2113 iter->Stop(); 2114 } 2115 2116 TEST_F(MindDataTestPipeline, TestNgramFail1) { 2117 // Testing the incorrect parameter of Ngram interface. 
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Ngram operation on ds
  // Testing the vector of ngram is empty
  std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({}));
  EXPECT_NE(ngram_op, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({ngram_op});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid Ngram input (the vector of ngram is empty)
  EXPECT_EQ(iter, nullptr);
}

TEST_F(MindDataTestPipeline, TestNgramFail2) {
  // Testing the incorrect parameter of Ngram interface.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail2.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Ngram operation on ds
  // Testing a value in the ngrams vector that is less than or equal to 0
  std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({0}));
  EXPECT_NE(ngram_op, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({ngram_op});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid Ngram input (a value in the ngrams vector is less than or equal to 0)
  EXPECT_EQ(iter, nullptr);
}

TEST_F(MindDataTestPipeline, TestNgramFail3) {
  // Testing the incorrect parameter of Ngram interface.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail3.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Ngram operation on ds
  // Testing a negative value in the ngrams vector
  std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({-2}));
  EXPECT_NE(ngram_op, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({ngram_op});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid Ngram input (a value in the ngrams vector is less than or equal to 0)
  EXPECT_EQ(iter, nullptr);
}

TEST_F(MindDataTestPipeline, TestNgramFail4) {
  // Testing the incorrect parameter of Ngram interface.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail4.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Ngram operation on ds
  // Testing the second parameter pad_width in left_pad vector less than 0
  std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2}, {"", -1}));
  EXPECT_NE(ngram_op, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({ngram_op});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid Ngram input (the second parameter pad_width in left_pad vector less than 0)
  EXPECT_EQ(iter, nullptr);
}

TEST_F(MindDataTestPipeline, TestNgramFail5) {
  // Testing the incorrect parameter of Ngram interface.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail5.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Ngram operation on ds
  // Testing the second parameter pad_width in right_pad vector less than 0
  std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2}, {"", 1}, {"", -1}));
  EXPECT_NE(ngram_op, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({ngram_op});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid Ngram input (the second parameter pad_width in right_pad vector less than 0)
  EXPECT_EQ(iter, nullptr);
}

TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success) {
  // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfkc.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create normalizeutf8 operation on ds
  std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfkc);
  EXPECT_NE(normalizeutf8, nullptr);

  // Create Map operation on ds
  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // NFKC: compatibility decomposition followed by canonical composition
  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
    mindspore::MSTensor ms_expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success1) {
  // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfc.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create normalizeutf8 operation on ds
  std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfc);
  EXPECT_NE(normalizeutf8, nullptr);

  // Create Map operation on ds
  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // NFC: canonical composition only — compatibility characters (fi, superscripts) are preserved
  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "ﬁ", "2⁵", "ẛ̣"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
    mindspore::MSTensor ms_expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success2) {
  // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfd.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success2.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create normalizeutf8 operation on ds
  std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfd);
  EXPECT_NE(normalizeutf8, nullptr);

  // Create Map operation on ds
  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // NFD: canonical decomposition — composed characters are split into base + combining marks
  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "ﬁ", "2⁵", "ẛ̣"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
    mindspore::MSTensor ms_expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success3) {
  // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfkd.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success3.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create normalizeutf8 operation on ds
  std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfkd);
  EXPECT_NE(normalizeutf8, nullptr);

  // Create Map operation on ds
  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // NFKD: compatibility decomposition — ligatures and superscripts are expanded
  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
    mindspore::MSTensor ms_expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess) {
  // Testing the parameter of RegexReplace interface when the replace_all is true.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create regex_replace operation on ds: replace every whitespace run with "_"
  std::shared_ptr<TensorTransform> regex_replace = std::make_shared<text::RegexReplace>("\\s+", "_", true);
  EXPECT_NE(regex_replace, nullptr);

  // Create Map operation on ds
  ds = ds->Map({regex_replace}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
2434 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 2435 EXPECT_NE(iter, nullptr); 2436 2437 // Iterate the dataset and get each row 2438 std::unordered_map<std::string, mindspore::MSTensor> row; 2439 ASSERT_OK(iter->GetNextRow(&row)); 2440 2441 std::vector<std::string> expected = {"Hello_World", "Let's_Go", "1:hello", "2:world", 2442 "31:beijing", "Welcome_to_China!", "_我_不想_长大_", "Welcome_to_Shenzhen!"}; 2443 2444 uint64_t i = 0; 2445 while (row.size() != 0) { 2446 auto ind = row["text"]; 2447 std::shared_ptr<Tensor> de_expected_tensor; 2448 ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor)); 2449 mindspore::MSTensor ms_expected_tensor = 2450 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 2451 EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor); 2452 ASSERT_OK(iter->GetNextRow(&row)); 2453 i++; 2454 } 2455 2456 EXPECT_EQ(i, 8); 2457 2458 // Manually terminate the pipeline 2459 iter->Stop(); 2460 } 2461 2462 TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess1) { 2463 // Testing the parameter of RegexReplace interface when the replace_all is false. 2464 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess1."; 2465 2466 // Create a TextFile dataset 2467 std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt"; 2468 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 2469 EXPECT_NE(ds, nullptr); 2470 2471 // Create regex_replace operation on ds 2472 std::shared_ptr<TensorTransform> regex_replace = std::make_shared<text::RegexReplace>("\\s+", "_", false); 2473 EXPECT_NE(regex_replace, nullptr); 2474 2475 // Create Map operation on ds 2476 ds = ds->Map({regex_replace}, {"text"}); 2477 EXPECT_NE(ds, nullptr); 2478 2479 // Create an iterator over the result of the above dataset 2480 // This will trigger the creation of the Execution Tree and launch it. 
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // With replace_all=false only the first whitespace match in each line is
  // replaced by "_"; later whitespace is left untouched.
  std::vector<std::string> expected = {"Hello_World", "Let's_Go", "1:hello", "2:world",
                                       "31:beijing", "Welcome_to China!", "_我 不想 长大 ", "Welcome_to Shenzhen!"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
    mindspore::MSTensor ms_expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 8);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess) {
  // Testing the parameter of RegexTokenizer interface when the with_offsets is false.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create regex_tokenizer operation on ds
  // Delim pattern "\s+" splits on whitespace; the keep-delim pattern "\s+" keeps
  // the matched whitespace as a token of its own (see expected rows below).
  std::shared_ptr<TensorTransform> regex_tokenizer = std::make_shared<text::RegexTokenizer>("\\s+", "\\s+", false);
  EXPECT_NE(regex_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({regex_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {{"Hello", " ", "World"},
                                                    {"Let's", " ", "Go"},
                                                    {"1:hello"},
                                                    {"2:world"},
                                                    {"31:beijing"},
                                                    {"Welcome", " ", "to", " ", "China!"},
                                                    {" ", "我", " ", "不想", " ", "长大", " "},
                                                    {"Welcome", " ", "to", " ", "Shenzhen!"}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];

    // Each expected row is a 1-D string tensor with x tokens.
    std::shared_ptr<Tensor> de_expected_tensor;
    int x = expected[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 8);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess1) {
  // Testing the parameter of RegexTokenizer interface when the with_offsets is true.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create regex_tokenizer operation on ds
  std::shared_ptr<TensorTransform> regex_tokenizer = std::make_shared<text::RegexTokenizer>("\\s+", "\\s+", true);
  EXPECT_NE(regex_tokenizer, nullptr);

  // Create Map operation on ds
  // with_offsets=true makes the op emit three columns, so rename the single input
  // column into the token/offset output columns.
  ds = ds->Map({regex_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
               {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected_tokens = {{"Hello", " ", "World"},
                                                           {"Let's", " ", "Go"},
                                                           {"1:hello"},
                                                           {"2:world"},
                                                           {"31:beijing"},
                                                           {"Welcome", " ", "to", " ", "China!"},
                                                           {" ", "我", " ", "不想", " ", "长大", " "},
                                                           {"Welcome", " ", "to", " ", "Shenzhen!"}};

  // Start/limit offsets are byte positions of each token in the UTF-8 input line
  // (CJK characters advance the offset by 3 bytes each).
  std::vector<std::vector<uint32_t>> expected_offsets_start = {
    {0, 5, 6}, {0, 5, 6}, {0}, {0}, {0}, {0, 7, 8, 10, 11}, {0, 2, 5, 6, 12, 14, 20}, {0, 7, 8, 10, 11}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {
    {5, 6, 11}, {5, 6, 8}, {7}, {7}, {10}, {7, 8, 10, 11, 17}, {2, 5, 6, 12, 14, 20, 21}, {7, 8, 10, 11, 20}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto token = row["token"];
    auto start = row["offsets_start"];
    auto limit = row["offsets_limit"];

    // Token tensor, offsets_start tensor and offsets_limit tensor share length x.
    std::shared_ptr<Tensor> de_expected_tokens;
    int x = expected_tokens[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
    mindspore::MSTensor ms_expected_tokens =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
    EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);

    std::shared_ptr<Tensor> de_expected_offsets_start;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
    mindspore::MSTensor ms_expected_offsets_start =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
    EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);

    std::shared_ptr<Tensor> de_expected_offsets_limit;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
    mindspore::MSTensor ms_expected_offsets_limit =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
    EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 8);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess) {
  // Testing the parameter of UnicodeCharTokenizer interface when the with_offsets is default.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create unicodechar_tokenizer operation on ds
  std::shared_ptr<TensorTransform> unicodechar_tokenizer = std::make_shared<text::UnicodeCharTokenizer>();
  EXPECT_NE(unicodechar_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({unicodechar_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Each input line is split into individual Unicode characters.
  std::vector<std::vector<std::string>> expected = {
    {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
    {"北", "京", "欢", "迎", "您", "!"},
    {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
    {" ", " "}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];

    std::shared_ptr<Tensor> de_expected_tensor;
    int x = expected[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess1) {
  // Testing the parameter of UnicodeCharTokenizer interface when the with_offsets is true.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create unicodechar_tokenizer operation on ds
  std::shared_ptr<TensorTransform> unicodechar_tokenizer = std::make_shared<text::UnicodeCharTokenizer>(true);
  EXPECT_NE(unicodechar_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({unicodechar_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
               {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected_tokens = {
    {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
    {"北", "京", "欢", "迎", "您", "!"},
    {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
    {" ", " "}};

  // Offsets are byte positions: ASCII characters advance by 1, CJK characters by
  // 3 bytes each (UTF-8).
  std::vector<std::vector<uint32_t>> expected_offsets_start = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18},
    {0, 3, 6, 9, 12, 15},
    {0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16},
    {0, 1}};

  std::vector<std::vector<uint32_t>> expected_offsets_limit = {
    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
    {3, 6, 9, 12, 15, 18},
    {3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17},
    {1, 2}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto token = row["token"];
    auto start = row["offsets_start"];
    auto limit = row["offsets_limit"];

    std::shared_ptr<Tensor> de_expected_tokens;
    int x = expected_tokens[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
    mindspore::MSTensor ms_expected_tokens =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
    EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);

    std::shared_ptr<Tensor> de_expected_offsets_start;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
    mindspore::MSTensor ms_expected_offsets_start =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
    EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);

    std::shared_ptr<Tensor> de_expected_offsets_limit;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
    mindspore::MSTensor ms_expected_offsets_limit =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
    EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}

// Shared vocabularies used by the WordpieceTokenizer tests below.
std::vector<std::string> vocab_english = {"book", "cholera", "era", "favor", "##ite", "my",
                                          "is", "love", "dur", "##ing", "the"};

std::vector<std::string> vocab_chinese = {"我", "最", "喜", "欢", "的", "书", "是", "霍", "乱", "时", "期", "爱", "情"};

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess1.";
  // Test WordpieceTokenizer with default parameters on English vocab

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Take operation on ds
  ds = ds->Take(10);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create WordpieceTokenizer operation on ds
  std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({wordpiece_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation
  // of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Out-of-vocabulary words map to "[UNK]" (the default unknown token).
  std::vector<std::vector<std::string>> expected = {
    {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto txt = row["text"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 10);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess2.";
  // Test WordpieceTokenizer with empty unknown_token

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Take operation on ds
  ds = ds->Take(10);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create WordpieceTokenizer operation on ds
  // With unknown_token="" an out-of-vocabulary word is emitted unchanged (see the
  // last expected row, "what").
  std::shared_ptr<TensorTransform> wordpiece_tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "", false);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({wordpiece_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {
    {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"what"}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto txt = row["text"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 10);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess3.";
  // Test WordpieceTokenizer with non-default max_bytes_per_token

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Take operation on ds
  ds = ds->Take(10);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create WordpieceTokenizer operation on ds
  // Words longer than max_bytes_per_token=4 are mapped straight to "[UNK]".
  std::shared_ptr<TensorTransform> wordpiece_tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 4, "[UNK]", false);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({wordpiece_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {{"my"}, {"[UNK]"}, {"book"}, {"is"}, {"love"},
                                                    {"[UNK]"}, {"the"}, {"[UNK]"}, {"era"}, {"[UNK]"}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto txt = row["text"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 10);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess4) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess4.";
  // Test WordpieceTokenizer with default parameters on Chinese vocab

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Skip operation on ds
  // Skip the 10 English rows consumed by the earlier tests' Take(10) window.
  ds = ds->Skip(10);
  EXPECT_NE(ds, nullptr);

  // Create Take operation on ds
  ds = ds->Take(15);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_chinese, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create WordpieceTokenizer operation on ds
  std::shared_ptr<TensorTransform> wordpiece_tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", false);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({wordpiece_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {{"我"}, {"最"}, {"喜"}, {"欢"}, {"的"}, {"书"}, {"是"}, {"霍"},
                                                    {"乱"}, {"时"}, {"期"}, {"的"}, {"爱"}, {"情"}, {"[UNK]"}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto txt = row["text"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 15);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess5)
{
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess5.";
  // Test WordpieceTokenizer with with_offsets true

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Take operation on ds
  ds = ds->Take(10);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create WordpieceTokenizer operation on ds
  std::shared_ptr<TensorTransform> wordpiece_tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", true);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create Map operation on ds
  // with_offsets=true produces token plus start/limit offset columns.
  ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {
    {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};
  // Byte offsets of each wordpiece within its source word (e.g. "favor"/"##ite"
  // cover bytes [0,5) and [5,8) of "favorite").
  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0}, {0, 5}, {0}, {0}, {0}, {0, 3}, {0}, {0}, {0}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{2}, {5, 8}, {4}, {2}, {4}, {3, 6}, {3}, {7}, {3}, {4}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto txt = row["token"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);

    auto start = row["offsets_start"];
    std::shared_ptr<Tensor> de_expected_start_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], &de_expected_start_tensor));
    mindspore::MSTensor expected_start_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_start_tensor));
    EXPECT_MSTENSOR_EQ(start, expected_start_tensor);

    auto limit = row["offsets_limit"];
    std::shared_ptr<Tensor> de_expected_limit_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], &de_expected_limit_tensor));
    mindspore::MSTensor expected_limit_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_limit_tensor));
    EXPECT_MSTENSOR_EQ(limit, expected_limit_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 10);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess6) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess6.";
  // Test WordpieceTokenizer with max_bytes_per_token equals to 0

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Take operation on ds
  ds = ds->Take(10);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create WordpieceTokenizer operation on ds
  // max_bytes_per_token=0 means every word exceeds the limit, so every row is
  // expected to become "[UNK]".
  std::shared_ptr<TensorTransform> wordpiece_tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 0, "[UNK]", true);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {{"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"},
                                                    {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto txt = row["token"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 10);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail1.";
  // Test WordpieceTokenizer with nullptr vocab

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create WordpieceTokenizer operation on ds
  std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(nullptr);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({wordpiece_tokenizer});
  EXPECT_NE(ds, nullptr);

  // Parameter validation is deferred until the iterator is built.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid WordpieceTokenizer input with nullptr vocab
  EXPECT_EQ(iter, nullptr);
}

TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail2.";
  // Test WordpieceTokenizer with negative max_bytes_per_token

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create WordpieceTokenizer operation on ds
  std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab, "##", -1);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({wordpiece_tokenizer});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid WordpieceTokenizer input with negative max_bytes_per_token
  EXPECT_EQ(iter, nullptr);
}

TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess) {
  // Testing the parameter of UnicodeScriptTokenizer interface when the with_offsets and the keep_whitespace is default.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create unicodescript_tokenizer operation on ds
  std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>();
  EXPECT_NE(unicodescript_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({unicodescript_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Tokens split where the Unicode script changes (Latin vs Han vs punctuation);
  // whitespace is dropped with the default keep_whitespace=false.
  std::vector<std::vector<std::string>> expected = {
    {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];

    std::shared_ptr<Tensor> de_expected_tensor;
    int x = expected[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess1) {
  // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is true and the
  // with_offsets is false.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create unicodescript_tokenizer operation on ds
  std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>(true);
  EXPECT_NE(unicodescript_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({unicodescript_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // keep_whitespace=true keeps whitespace runs as tokens of their own.
  std::vector<std::vector<std::string>> expected = {
    {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {" "}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];

    std::shared_ptr<Tensor> de_expected_tensor;
    int x = expected[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}

TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess2) {
  // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is false and the with_offsets is
  // true.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess2.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create unicodescript_tokenizer operation on ds
  std::shared_ptr<TensorTransform> unicodescript_tokenizer =
    std::make_shared<text::UnicodeScriptTokenizer>(false, true);
  EXPECT_NE(unicodescript_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
               {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected_tokens = {
    {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};

  // Byte offsets of each token within the UTF-8 input line.
  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 18, 19}, {15, 18}, {9, 16, 17}, {0}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto token = row["token"];
    auto start = row["offsets_start"];
    auto limit = row["offsets_limit"];

    std::shared_ptr<Tensor> de_expected_tokens;
    int x = expected_tokens[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
    mindspore::MSTensor ms_expected_tokens =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
    EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);

    std::shared_ptr<Tensor> de_expected_offsets_start;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
    mindspore::MSTensor ms_expected_offsets_start =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
    EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);

    std::shared_ptr<Tensor> de_expected_offsets_limit;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
    mindspore::MSTensor ms_expected_offsets_limit =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
    EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);

ASSERT_OK(iter->GetNextRow(&row)); 3351 i++; 3352 } 3353 3354 EXPECT_EQ(i, 4); 3355 3356 // Manually terminate the pipeline 3357 iter->Stop(); 3358 } 3359 3360 TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess3) { 3361 // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is true and the with_offsets is 3362 // true. 3363 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess3."; 3364 3365 // Create a TextFile dataset 3366 std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt"; 3367 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 3368 EXPECT_NE(ds, nullptr); 3369 3370 // Create unicodescript_tokenizer operation on ds 3371 std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>(true, true); 3372 EXPECT_NE(unicodescript_tokenizer, nullptr); 3373 3374 // Create Map operation on ds 3375 ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"}, 3376 {"token", "offsets_start", "offsets_limit"}); 3377 EXPECT_NE(ds, nullptr); 3378 3379 // Create an iterator over the result of the above dataset 3380 // This will trigger the creation of the Execution Tree and launch it. 
3381 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 3382 EXPECT_NE(iter, nullptr); 3383 3384 // Iterate the dataset and get each row 3385 std::unordered_map<std::string, mindspore::MSTensor> row; 3386 ASSERT_OK(iter->GetNextRow(&row)); 3387 3388 std::vector<std::vector<std::string>> expected_tokens = { 3389 {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {" "}}; 3390 3391 std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 7, 8, 10, 11, 18}, {0, 15}, {0, 9, 16}, {0}}; 3392 std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 8, 10, 11, 18, 19}, {15, 18}, {9, 16, 17}, {2}}; 3393 3394 uint64_t i = 0; 3395 while (row.size() != 0) { 3396 auto token = row["token"]; 3397 auto start = row["offsets_start"]; 3398 auto limit = row["offsets_limit"]; 3399 3400 std::shared_ptr<Tensor> de_expected_tokens; 3401 int x = expected_tokens[i].size(); 3402 ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens)); 3403 mindspore::MSTensor ms_expected_tokens = 3404 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens)); 3405 EXPECT_MSTENSOR_EQ(token, ms_expected_tokens); 3406 3407 std::shared_ptr<Tensor> de_expected_offsets_start; 3408 ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start)); 3409 mindspore::MSTensor ms_expected_offsets_start = 3410 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start)); 3411 EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start); 3412 3413 std::shared_ptr<Tensor> de_expected_offsets_limit; 3414 ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit)); 3415 mindspore::MSTensor ms_expected_offsets_limit = 3416 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit)); 3417 EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit); 3418 3419 
ASSERT_OK(iter->GetNextRow(&row)); 3420 i++; 3421 } 3422 3423 EXPECT_EQ(i, 4); 3424 3425 // Manually terminate the pipeline 3426 iter->Stop(); 3427 } 3428 3429 TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess) { 3430 // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is default. 3431 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess."; 3432 3433 // Create a TextFile dataset 3434 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt"; 3435 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 3436 EXPECT_NE(ds, nullptr); 3437 3438 // Create white_tokenizer operation on ds 3439 std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>(); 3440 EXPECT_NE(white_tokenizer, nullptr); 3441 3442 // Create Map operation on ds 3443 ds = ds->Map({white_tokenizer}, {"text"}); 3444 EXPECT_NE(ds, nullptr); 3445 3446 // Create an iterator over the result of the above dataset 3447 // This will trigger the creation of the Execution Tree and launch it. 
3448 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 3449 EXPECT_NE(iter, nullptr); 3450 3451 // Iterate the dataset and get each row 3452 std::unordered_map<std::string, mindspore::MSTensor> row; 3453 ASSERT_OK(iter->GetNextRow(&row)); 3454 3455 std::vector<std::vector<std::string>> expected = { 3456 {"This", "is", "a", "text", "file."}, {"Be", "happy", "every", "day."}, {"Good", "luck", "to", "everyone."}}; 3457 3458 uint64_t i = 0; 3459 while (row.size() != 0) { 3460 auto ind = row["text"]; 3461 3462 std::shared_ptr<Tensor> de_expected_tensor; 3463 int x = expected[i].size(); 3464 ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor)); 3465 mindspore::MSTensor expected_tensor = 3466 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor)); 3467 EXPECT_MSTENSOR_EQ(ind, expected_tensor); 3468 3469 ASSERT_OK(iter->GetNextRow(&row)); 3470 i++; 3471 } 3472 3473 EXPECT_EQ(i, 3); 3474 3475 // Manually terminate the pipeline 3476 iter->Stop(); 3477 } 3478 3479 TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess1) { 3480 // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is true. 
3481 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess1."; 3482 3483 // Create a TextFile dataset 3484 std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt"; 3485 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse); 3486 EXPECT_NE(ds, nullptr); 3487 3488 // Create white_tokenizer operation on ds 3489 std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>(true); 3490 EXPECT_NE(white_tokenizer, nullptr); 3491 3492 // Create Map operation on ds 3493 ds = ds->Map({white_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"}, 3494 {"token", "offsets_start", "offsets_limit"}); 3495 EXPECT_NE(ds, nullptr); 3496 3497 // Create an iterator over the result of the above dataset 3498 // This will trigger the creation of the Execution Tree and launch it. 3499 std::shared_ptr<Iterator> iter = ds->CreateIterator(); 3500 EXPECT_NE(iter, nullptr); 3501 3502 // Iterate the dataset and get each row 3503 std::unordered_map<std::string, mindspore::MSTensor> row; 3504 ASSERT_OK(iter->GetNextRow(&row)); 3505 3506 std::vector<std::vector<std::string>> expected_tokens = { 3507 {"Welcome", "to", "Beijing!"}, {"北京欢迎您!"}, {"我喜欢English!"}, {""}}; 3508 3509 std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11}, {0}, {0}, {0}}; 3510 std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 19}, {18}, {17}, {0}}; 3511 3512 uint64_t i = 0; 3513 while (row.size() != 0) { 3514 auto token = row["token"]; 3515 auto start = row["offsets_start"]; 3516 auto limit = row["offsets_limit"]; 3517 3518 std::shared_ptr<Tensor> de_expected_tokens; 3519 int x = expected_tokens[i].size(); 3520 ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens)); 3521 mindspore::MSTensor ms_expected_tokens = 3522 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens)); 3523 EXPECT_MSTENSOR_EQ(token, 
ms_expected_tokens); 3524 3525 std::shared_ptr<Tensor> de_expected_offsets_start; 3526 ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start)); 3527 mindspore::MSTensor ms_expected_offsets_start = 3528 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start)); 3529 EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start); 3530 3531 std::shared_ptr<Tensor> de_expected_offsets_limit; 3532 ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit)); 3533 mindspore::MSTensor ms_expected_offsets_limit = 3534 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit)); 3535 EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit); 3536 3537 ASSERT_OK(iter->GetNextRow(&row)); 3538 i++; 3539 } 3540 3541 EXPECT_EQ(i, 4); 3542 3543 // Manually terminate the pipeline 3544 iter->Stop(); 3545 } 3546