1 /**
2 * Copyright 2020-2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include "common/common.h"
17 #include "minddata/dataset/core/global_context.h"
18 #include "minddata/dataset/include/dataset/datasets.h"
19
20 using namespace mindspore::dataset;
21 using mindspore::dataset::GlobalContext;
22 using mindspore::dataset::ShuffleMode;
23 using mindspore::dataset::Tensor;
24
25 class MindDataTestPipeline : public UT::DatasetOpTesting {
26 protected:
27 };
28
// Reads the AFQMC train, test and eval files (one file per usage) without shuffling and checks that the
// "sentence1" column of every row matches the expected text, in file order.
TEST_F(MindDataTestPipeline, TestCLUEDatasetAFQMC) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetAFQMC.";

  const std::string task = "AFQMC";

  // One entry per usage: the file to read, the usage string handed to CLUE() and the expected sentences.
  struct UsageCase {
    std::string file;
    std::string usage;
    std::vector<std::string> expected;
  };
  const std::vector<UsageCase> usage_cases = {
    {datasets_root_path_ + "/testCLUE/afqmc/train.json",
     "train",
     {"蚂蚁借呗等额还款能否换成先息后本", "蚂蚁花呗说我违约了", "帮我看看本月花呗账单结清了没"}},
    {datasets_root_path_ + "/testCLUE/afqmc/test.json",
     "test",
     {"借呗取消的时间", "网商贷用什么方法转变成借呗", "我的借呗为什么开通不了"}},
    {datasets_root_path_ + "/testCLUE/afqmc/dev.json",
     "eval",
     {"你有花呗吗", "吃饭能用花呗吗", "蚂蚁花呗支付金额有什么限制"}},
  };

  for (const auto &usage_case : usage_cases) {
    // Tag every failure below with the usage being exercised
    SCOPED_TRACE("usage: " + usage_case.usage);

    // Create a CLUE dataset over a single file, reading all rows (num_samples = 0) without shuffling
    std::shared_ptr<Dataset> ds = CLUE({usage_case.file}, task, usage_case.usage, 0, ShuffleMode::kFalse);
    // Fatal check: ds is dereferenced on the next statement
    ASSERT_NE(ds, nullptr);

    // Create an iterator over the result of the above dataset
    // This will trigger the creation of the Execution Tree and launch it.
    std::shared_ptr<Iterator> iter = ds->CreateIterator();
    ASSERT_NE(iter, nullptr);

    // Fetch the first row and make sure the column under test is present
    std::unordered_map<std::string, mindspore::MSTensor> row;
    ASSERT_OK(iter->GetNextRow(&row));
    ASSERT_NE(row.find("sentence1"), row.end());

    uint64_t i = 0;
    while (!row.empty()) {
      // Guard the index so an unexpected extra row fails the test instead of reading past the vector
      ASSERT_LT(i, usage_case.expected.size());
      auto text = row["sentence1"];
      std::shared_ptr<Tensor> de_text;
      ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
      std::string_view sv;
      ASSERT_OK(de_text->GetItemAt(&sv, {}));
      std::string ss(sv);
      EXPECT_STREQ(ss.c_str(), usage_case.expected[i].c_str());
      MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
      ASSERT_OK(iter->GetNextRow(&row));
      i++;
    }

    // Every usage must yield exactly the expected number of rows (3 per file)
    EXPECT_EQ(i, usage_case.expected.size());

    // Manually terminate the pipeline
    iter->Stop();
  }
}
120
// Reads the AFQMC train file with the sample count capped at 2 and checks that exactly 2 rows come back.
TEST_F(MindDataTestPipeline, TestCLUEDatasetBasic) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetBasic.";

  // Create a CLUE dataset over a single file, limited to 2 samples
  std::string clue_file = datasets_root_path_ + "/testCLUE/afqmc/train.json";
  std::string task = "AFQMC";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 2);
  // Fatal check: ds is dereferenced on the next statement
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Fetch the first row and make sure the expected column is present
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  ASSERT_NE(row.find("sentence1"), row.end());

  // Count rows until the iterator hands back an empty row
  uint64_t i = 0;
  while (!row.empty()) {
    auto text = row["sentence1"];
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    i++;
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // Expect 2 samples
  EXPECT_EQ(i, 2);

  // Manually terminate the pipeline
  iter->Stop();
}
155
// Builds two 2-sample CLUE datasets, repeats them 2x and 3x, projects both onto "sentence1", concatenates
// them and checks the total row count of the combined pipeline.
TEST_F(MindDataTestPipeline, TestCLUEDatasetBasicWithPipeline) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetBasicWithPipeline.";

  // Create two CLUE datasets over the same single file, each limited to 2 samples
  std::string clue_file = datasets_root_path_ + "/testCLUE/afqmc/train.json";
  std::string task = "AFQMC";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds1 = CLUE({clue_file}, task, usage, 2);
  std::shared_ptr<Dataset> ds2 = CLUE({clue_file}, task, usage, 2);
  // Fatal checks throughout: every dataset pointer is dereferenced by the following operation
  ASSERT_NE(ds1, nullptr);
  ASSERT_NE(ds2, nullptr);

  // Repeat ds1 twice and ds2 three times
  int32_t repeat_num = 2;
  ds1 = ds1->Repeat(repeat_num);
  ASSERT_NE(ds1, nullptr);
  repeat_num = 3;
  ds2 = ds2->Repeat(repeat_num);
  ASSERT_NE(ds2, nullptr);

  // Keep only the "sentence1" column in both datasets
  std::vector<std::string> column_project = {"sentence1"};
  ds1 = ds1->Project(column_project);
  ASSERT_NE(ds1, nullptr);
  ds2 = ds2->Project(column_project);
  ASSERT_NE(ds2, nullptr);

  // Append ds2 to ds1
  ds1 = ds1->Concat({ds2});
  ASSERT_NE(ds1, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds1->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Fetch the first row and make sure the projected column is present
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  ASSERT_NE(row.find("sentence1"), row.end());

  // Count rows until the iterator hands back an empty row
  uint64_t i = 0;
  while (!row.empty()) {
    auto text = row["sentence1"];
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    i++;
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // Expect 10 samples: 2 samples x 2 repeats + 2 samples x 3 repeats
  EXPECT_EQ(i, 10);

  // Manually terminate the pipeline
  iter->Stop();
}
211
// Checks the dataset getters on a 2-sample AFQMC train dataset: the reported size and the column names.
TEST_F(MindDataTestPipeline, TestCLUEGetters) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEGetters.";

  // Create a CLUE dataset over a single file, limited to 2 samples
  std::string clue_file = datasets_root_path_ + "/testCLUE/afqmc/train.json";
  std::string task = "AFQMC";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 2);
  // Fatal check: ds is dereferenced by the getter calls below
  ASSERT_NE(ds, nullptr);

  const std::vector<std::string> column_names = {"label", "sentence1", "sentence2"};

  EXPECT_EQ(ds->GetDatasetSize(), 2);
  EXPECT_EQ(ds->GetColumnNames(), column_names);
}
226
// Reads the CMNLI train file without shuffling and checks the "sentence1" column of every row.
TEST_F(MindDataTestPipeline, TestCLUEDatasetCMNLI) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetCMNLI.";

  // Create a CLUE dataset over a single file, reading all rows (num_samples = 0) without shuffling
  std::string clue_file = datasets_root_path_ + "/testCLUE/cmnli/train.json";
  std::string task = "CMNLI";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 0, ShuffleMode::kFalse);
  // Fatal check: ds is dereferenced on the next statement
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Fetch the first row and make sure the column under test is present
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  ASSERT_NE(row.find("sentence1"), row.end());

  const std::vector<std::string> expected_result = {"你应该给这件衣服定一个价格。", "我怎么知道他要说什么", "向左。"};

  uint64_t i = 0;
  while (!row.empty()) {
    // Guard the index so an unexpected extra row fails the test instead of reading past the vector
    ASSERT_LT(i, expected_result.size());
    auto text = row["sentence1"];
    std::shared_ptr<Tensor> de_text;
    ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
    std::string_view sv;
    ASSERT_OK(de_text->GetItemAt(&sv, {}));
    std::string ss(sv);
    EXPECT_STREQ(ss.c_str(), expected_result[i].c_str());
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 3 samples
  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline
  iter->Stop();
}
269
// Reads the CSL train file without shuffling and checks the "abst" column of every row.
TEST_F(MindDataTestPipeline, TestCLUEDatasetCSL) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetCSL.";

  // Create a CLUE dataset over a single file, reading all rows (num_samples = 0) without shuffling
  std::string clue_file = datasets_root_path_ + "/testCLUE/csl/train.json";
  std::string task = "CSL";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 0, ShuffleMode::kFalse);
  // Fatal check: ds is dereferenced on the next statement
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Fetch the first row and make sure the column under test is present
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  ASSERT_NE(row.find("abst"), row.end());

  // All three rows are expected to carry the same abstract text
  const std::vector<std::string> expected_result = {"这是一段长文本", "这是一段长文本", "这是一段长文本"};

  uint64_t i = 0;
  while (!row.empty()) {
    // Guard the index so an unexpected extra row fails the test instead of reading past the vector
    ASSERT_LT(i, expected_result.size());
    auto text = row["abst"];
    std::shared_ptr<Tensor> de_text;
    ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
    std::string_view sv;
    ASSERT_OK(de_text->GetItemAt(&sv, {}));
    std::string ss(sv);
    EXPECT_STREQ(ss.c_str(), expected_result[i].c_str());
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 3 samples
  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline
  iter->Stop();
}
312
// Reads the AFQMC train file in a sharded configuration and checks that this shard receives a single row.
TEST_F(MindDataTestPipeline, TestCLUEDatasetDistribution) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetDistribution.";

  // Create a CLUE dataset over a single file with global shuffle.
  // NOTE(review): the trailing "3, 0" presumably means num_shards = 3, shard_id = 0 - confirm against the
  // CLUE() signature.
  std::string clue_file = datasets_root_path_ + "/testCLUE/afqmc/train.json";
  std::string task = "AFQMC";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 0, ShuffleMode::kGlobal, 3, 0);
  // Fatal check: ds is dereferenced on the next statement
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Fetch the first row and make sure the expected column is present
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  ASSERT_NE(row.find("sentence1"), row.end());

  // Count rows until the iterator hands back an empty row
  uint64_t i = 0;
  while (!row.empty()) {
    auto text = row["sentence1"];
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    i++;
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // Expect 1 sample
  EXPECT_EQ(i, 1);

  // Manually terminate the pipeline
  iter->Stop();
}
347
// Negative test: each CLUE() call below is given one invalid argument. The dataset object itself is still
// created, but creating an iterator over it is expected to fail and return nullptr.
TEST_F(MindDataTestPipeline, TestCLUEDatasetFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetFail.";
  // Valid baseline arguments; each case replaces exactly one of them with a bad value
  std::string clue_file = datasets_root_path_ + "/testCLUE/wsc/train.json";
  std::string task = "WSC";
  std::string usage = "train";
  std::string invalid_clue_file = "./NotExistFile";

  // Case 0: empty list of dataset files
  std::shared_ptr<Dataset> ds0 = CLUE({}, task, usage);
  // Fatal check (here and in every case below): the dataset pointer is dereferenced on the next statement
  ASSERT_NE(ds0, nullptr);
  std::shared_ptr<Iterator> iter0 = ds0->CreateIterator();
  // Expect failure: invalid CLUE input
  EXPECT_EQ(iter0, nullptr);

  // Case 1: dataset file path that does not exist
  std::shared_ptr<Dataset> ds1 = CLUE({invalid_clue_file}, task, usage);
  ASSERT_NE(ds1, nullptr);
  std::shared_ptr<Iterator> iter1 = ds1->CreateIterator();
  // Expect failure: invalid CLUE input
  EXPECT_EQ(iter1, nullptr);

  // Case 2: unrecognized task name
  std::shared_ptr<Dataset> ds2 = CLUE({clue_file}, "invalid_task", usage);
  ASSERT_NE(ds2, nullptr);
  std::shared_ptr<Iterator> iter2 = ds2->CreateIterator();
  // Expect failure: invalid CLUE input
  EXPECT_EQ(iter2, nullptr);

  // Case 3: unrecognized usage
  std::shared_ptr<Dataset> ds3 = CLUE({clue_file}, task, "invalid_usage");
  ASSERT_NE(ds3, nullptr);
  std::shared_ptr<Iterator> iter3 = ds3->CreateIterator();
  // Expect failure: invalid CLUE input
  EXPECT_EQ(iter3, nullptr);

  // Case 4: NOTE(review): presumably num_shards = 2 with shard_id = 2 (shard_id out of range) - confirm
  // against the CLUE() signature
  std::shared_ptr<Dataset> ds4 = CLUE({clue_file}, task, usage, 0, ShuffleMode::kGlobal, 2, 2);
  ASSERT_NE(ds4, nullptr);
  std::shared_ptr<Iterator> iter4 = ds4->CreateIterator();
  // Expect failure: invalid CLUE input
  EXPECT_EQ(iter4, nullptr);

  // Case 5: negative sample count (-1)
  std::shared_ptr<Dataset> ds5 = CLUE({clue_file}, task, usage, -1, ShuffleMode::kGlobal);
  ASSERT_NE(ds5, nullptr);
  std::shared_ptr<Iterator> iter5 = ds5->CreateIterator();
  // Expect failure: invalid CLUE input
  EXPECT_EQ(iter5, nullptr);

  // Case 6: NOTE(review): presumably a negative num_shards (-1) - confirm against the CLUE() signature
  std::shared_ptr<Dataset> ds6 = CLUE({clue_file}, task, usage, 0, ShuffleMode::kGlobal, -1);
  ASSERT_NE(ds6, nullptr);
  std::shared_ptr<Iterator> iter6 = ds6->CreateIterator();
  // Expect failure: invalid CLUE input
  EXPECT_EQ(iter6, nullptr);

  // Case 7: NOTE(review): presumably num_shards = 0 with a negative shard_id (-1) - confirm against the
  // CLUE() signature
  std::shared_ptr<Dataset> ds7 = CLUE({clue_file}, task, usage, 0, ShuffleMode::kGlobal, 0, -1);
  ASSERT_NE(ds7, nullptr);
  std::shared_ptr<Iterator> iter7 = ds7->CreateIterator();
  // Expect failure: invalid CLUE input
  EXPECT_EQ(iter7, nullptr);
}
412
// Reads the IFLYTEK train file without shuffling and checks the "sentence" column of every row.
TEST_F(MindDataTestPipeline, TestCLUEDatasetIFLYTEK) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetIFLYTEK.";

  // Create a CLUE dataset over a single file, reading all rows (num_samples = 0) without shuffling
  std::string clue_file = datasets_root_path_ + "/testCLUE/iflytek/train.json";
  std::string task = "IFLYTEK";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 0, ShuffleMode::kFalse);
  // Fatal check: ds is dereferenced on the next statement
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Fetch the first row and make sure the column under test is present
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  ASSERT_NE(row.find("sentence"), row.end());

  const std::vector<std::string> expected_result = {"第一个文本", "第二个文本", "第三个文本"};

  uint64_t i = 0;
  while (!row.empty()) {
    // Guard the index so an unexpected extra row fails the test instead of reading past the vector
    ASSERT_LT(i, expected_result.size());
    auto text = row["sentence"];
    std::shared_ptr<Tensor> de_text;
    ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
    std::string_view sv;
    ASSERT_OK(de_text->GetItemAt(&sv, {}));
    std::string ss(sv);
    EXPECT_STREQ(ss.c_str(), expected_result[i].c_str());
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 3 samples
  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline
  iter->Stop();
}
455
// Reads two AFQMC files (passed as dev.json, train.json) with file-level shuffle, a fixed seed and a single
// worker, and checks the "sentence1" column of all six rows against the expected order.
TEST_F(MindDataTestPipeline, TestCLUEDatasetShuffleFilesA) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetShuffleFilesA.";
  // Test CLUE Dataset with files shuffle, num_parallel_workers=1

  // Set configuration
  uint32_t original_seed = GlobalContext::config_manager()->seed();
  uint32_t original_num_parallel_workers = GlobalContext::config_manager()->num_parallel_workers();
  MS_LOG(DEBUG) << "ORIGINAL seed: " << original_seed << ", num_parallel_workers: " << original_num_parallel_workers;

  // Restore the global configuration on every exit path. A failing ASSERT_* returns from the test body
  // early, which would otherwise leave the overridden seed and worker count in place for later tests.
  struct ConfigRestorer {
    uint32_t seed;
    uint32_t num_parallel_workers;
    ~ConfigRestorer() {
      GlobalContext::config_manager()->set_seed(seed);
      GlobalContext::config_manager()->set_num_parallel_workers(num_parallel_workers);
    }
  };
  const ConfigRestorer restore_config{original_seed, original_num_parallel_workers};

  GlobalContext::config_manager()->set_seed(135);
  GlobalContext::config_manager()->set_num_parallel_workers(1);

  // Create a CLUE Dataset, with two text files, dev.json and train.json, in lexicographical order
  // Note: train.json has 3 rows
  // Note: dev.json has 3 rows
  // Use default of all samples
  // They have the same keywords
  // Set shuffle to files shuffle
  std::string clue_file1 = datasets_root_path_ + "/testCLUE/afqmc/train.json";
  std::string clue_file2 = datasets_root_path_ + "/testCLUE/afqmc/dev.json";
  std::string task = "AFQMC";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file2, clue_file1}, task, usage, 0, ShuffleMode::kFiles);
  // Fatal check: ds is dereferenced on the next statement
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Fetch the first row and make sure the column under test is present
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  ASSERT_NE(row.find("sentence1"), row.end());

  // Expected order: the three dev.json rows followed by the three train.json rows
  const std::vector<std::string> expected_result = {"你有花呗吗",
                                                    "吃饭能用花呗吗",
                                                    "蚂蚁花呗支付金额有什么限制",
                                                    "蚂蚁借呗等额还款能否换成先息后本",
                                                    "蚂蚁花呗说我违约了",
                                                    "帮我看看本月花呗账单结清了没"};

  uint64_t i = 0;
  while (!row.empty()) {
    // Guard the index so an unexpected extra row fails the test instead of reading past the vector
    ASSERT_LT(i, expected_result.size());
    auto text = row["sentence1"];
    std::shared_ptr<Tensor> de_text;
    ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
    std::string_view sv;
    ASSERT_OK(de_text->GetItemAt(&sv, {}));
    std::string ss(sv);
    MS_LOG(INFO) << "Text length: " << ss.length() << ", Text: " << ss.substr(0, 50);
    // Compare against expected result
    EXPECT_STREQ(ss.c_str(), expected_result[i].c_str());
    i++;
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // Expect 3 + 3 = 6 samples
  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();

  // The original configuration is restored by restore_config when it goes out of scope
}
522
// Same scenario as the ShuffleFilesA test but with the files passed as train.json, dev.json: with file-level
// shuffle, a fixed seed and a single worker, the six "sentence1" values are expected in the same order.
TEST_F(MindDataTestPipeline, TestCLUEDatasetShuffleFilesB) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetShuffleFilesB.";
  // Test CLUE Dataset with files shuffle, num_parallel_workers=1

  // Set configuration
  uint32_t original_seed = GlobalContext::config_manager()->seed();
  uint32_t original_num_parallel_workers = GlobalContext::config_manager()->num_parallel_workers();
  MS_LOG(DEBUG) << "ORIGINAL seed: " << original_seed << ", num_parallel_workers: " << original_num_parallel_workers;

  // Restore the global configuration on every exit path. A failing ASSERT_* returns from the test body
  // early, which would otherwise leave the overridden seed and worker count in place for later tests.
  struct ConfigRestorer {
    uint32_t seed;
    uint32_t num_parallel_workers;
    ~ConfigRestorer() {
      GlobalContext::config_manager()->set_seed(seed);
      GlobalContext::config_manager()->set_num_parallel_workers(num_parallel_workers);
    }
  };
  const ConfigRestorer restore_config{original_seed, original_num_parallel_workers};

  GlobalContext::config_manager()->set_seed(135);
  GlobalContext::config_manager()->set_num_parallel_workers(1);

  // Create a CLUE Dataset, with two text files, train.json and dev.json, in non-lexicographical order
  // Note: train.json has 3 rows
  // Note: dev.json has 3 rows
  // Use default of all samples
  // They have the same keywords
  // Set shuffle to files shuffle
  std::string clue_file1 = datasets_root_path_ + "/testCLUE/afqmc/train.json";
  std::string clue_file2 = datasets_root_path_ + "/testCLUE/afqmc/dev.json";
  std::string task = "AFQMC";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file1, clue_file2}, task, usage, 0, ShuffleMode::kFiles);
  // Fatal check: ds is dereferenced on the next statement
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Fetch the first row and make sure the column under test is present
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  ASSERT_NE(row.find("sentence1"), row.end());

  // Expected order: the three dev.json rows followed by the three train.json rows
  const std::vector<std::string> expected_result = {"你有花呗吗",
                                                    "吃饭能用花呗吗",
                                                    "蚂蚁花呗支付金额有什么限制",
                                                    "蚂蚁借呗等额还款能否换成先息后本",
                                                    "蚂蚁花呗说我违约了",
                                                    "帮我看看本月花呗账单结清了没"};

  uint64_t i = 0;
  while (!row.empty()) {
    // Guard the index so an unexpected extra row fails the test instead of reading past the vector
    ASSERT_LT(i, expected_result.size());
    auto text = row["sentence1"];
    std::shared_ptr<Tensor> de_text;
    ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
    std::string_view sv;
    ASSERT_OK(de_text->GetItemAt(&sv, {}));
    std::string ss(sv);
    // Compare against expected result
    EXPECT_STREQ(ss.c_str(), expected_result[i].c_str());
    i++;
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // Expect 3 + 3 = 6 samples
  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();

  // The original configuration is restored by restore_config when it goes out of scope
}
588
// Reads the AFQMC train file with global shuffle, a fixed seed and four workers, and checks the "sentence1"
// column of every row against the expected shuffled order.
TEST_F(MindDataTestPipeline, TestCLUEDatasetShuffleGlobal) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetShuffleGlobal.";
  // Test CLUE Dataset with global shuffle

  // Set configuration
  uint32_t original_seed = GlobalContext::config_manager()->seed();
  uint32_t original_num_parallel_workers = GlobalContext::config_manager()->num_parallel_workers();
  MS_LOG(DEBUG) << "ORIGINAL seed: " << original_seed << ", num_parallel_workers: " << original_num_parallel_workers;

  // Restore the global configuration on every exit path. A failing ASSERT_* returns from the test body
  // early, which would otherwise leave the overridden seed and worker count in place for later tests.
  struct ConfigRestorer {
    uint32_t seed;
    uint32_t num_parallel_workers;
    ~ConfigRestorer() {
      GlobalContext::config_manager()->set_seed(seed);
      GlobalContext::config_manager()->set_num_parallel_workers(num_parallel_workers);
    }
  };
  const ConfigRestorer restore_config{original_seed, original_num_parallel_workers};

  GlobalContext::config_manager()->set_seed(135);
  GlobalContext::config_manager()->set_num_parallel_workers(4);

  // Create a CLUE dataset over a single file, reading all rows (num_samples = 0) with global shuffle
  std::string clue_file = datasets_root_path_ + "/testCLUE/afqmc/train.json";
  std::string task = "AFQMC";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 0, ShuffleMode::kGlobal);
  // Fatal check: ds is dereferenced on the next statement
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Fetch the first row and make sure the column under test is present
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  ASSERT_NE(row.find("sentence1"), row.end());

  // Row order expected with the seed set above
  const std::vector<std::string> expected_result = {"蚂蚁花呗说我违约了", "帮我看看本月花呗账单结清了没",
                                                    "蚂蚁借呗等额还款能否换成先息后本"};
  uint64_t i = 0;
  while (!row.empty()) {
    // Guard the index so an unexpected extra row fails the test instead of reading past the vector
    ASSERT_LT(i, expected_result.size());
    auto text = row["sentence1"];
    std::shared_ptr<Tensor> de_text;
    ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
    std::string_view sv;
    ASSERT_OK(de_text->GetItemAt(&sv, {}));
    std::string ss(sv);
    EXPECT_STREQ(ss.c_str(), expected_result[i].c_str());
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    i++;
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // Expect 3 samples
  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline
  iter->Stop();

  // The original configuration is restored by restore_config when it goes out of scope
}
643
// Reads the TNEWS train file without shuffling and checks the "sentence" column of every row.
TEST_F(MindDataTestPipeline, TestCLUEDatasetTNEWS) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetTNEWS.";

  // Create a CLUE dataset over a single file, reading all rows (num_samples = 0) without shuffling
  std::string clue_file = datasets_root_path_ + "/testCLUE/tnews/train.json";
  std::string task = "TNEWS";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 0, ShuffleMode::kFalse);
  // Fatal check: ds is dereferenced on the next statement
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Fetch the first row and make sure the column under test is present
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  ASSERT_NE(row.find("sentence"), row.end());

  const std::vector<std::string> expected_result = {"新闻1", "新闻2", "新闻3"};

  uint64_t i = 0;
  while (!row.empty()) {
    // Guard the index so an unexpected extra row fails the test instead of reading past the vector
    ASSERT_LT(i, expected_result.size());
    auto text = row["sentence"];
    std::shared_ptr<Tensor> de_text;
    ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
    std::string_view sv;
    ASSERT_OK(de_text->GetItemAt(&sv, {}));
    std::string ss(sv);
    EXPECT_STREQ(ss.c_str(), expected_result[i].c_str());
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 3 samples
  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline
  iter->Stop();
}
686
// Reads the WSC train file without shuffling and checks the "text" column of every row.
TEST_F(MindDataTestPipeline, TestCLUEDatasetWSC) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetWSC.";

  // Create a CLUE dataset over a single file, reading all rows (num_samples = 0) without shuffling
  std::string clue_file = datasets_root_path_ + "/testCLUE/wsc/train.json";
  std::string task = "WSC";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 0, ShuffleMode::kFalse);
  // Fatal check: ds is dereferenced on the next statement
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Fetch the first row and make sure the column under test is present
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));
  ASSERT_NE(row.find("text"), row.end());

  const std::vector<std::string> expected_result = {"小明呢,他在哪?", "小红刚刚看到小明,他在操场",
                                                    "等小明回来,小张你叫他交作业"};

  uint64_t i = 0;
  while (!row.empty()) {
    // Guard the index so an unexpected extra row fails the test instead of reading past the vector
    ASSERT_LT(i, expected_result.size());
    auto text = row["text"];
    std::shared_ptr<Tensor> de_text;
    ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
    std::string_view sv;
    ASSERT_OK(de_text->GetItemAt(&sv, {}));
    std::string ss(sv);
    EXPECT_STREQ(ss.c_str(), expected_result[i].c_str());
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 3 samples
  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline
  iter->Stop();
}
730