• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include "common/common.h"
17 #include "minddata/dataset/core/global_context.h"
18 #include "minddata/dataset/include/dataset/datasets.h"
19 
20 using namespace mindspore::dataset;
21 using mindspore::dataset::GlobalContext;
22 using mindspore::dataset::ShuffleMode;
23 using mindspore::dataset::Tensor;
24 
25 class MindDataTestPipeline : public UT::DatasetOpTesting {
26  protected:
27 };
28 
TEST_F(MindDataTestPipeline,TestCLUEDatasetAFQMC)29 TEST_F(MindDataTestPipeline, TestCLUEDatasetAFQMC) {
30   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetAFQMC.";
31 
32   // Create a CLUEFile Dataset, with single CLUE file
33   std::string train_file = datasets_root_path_ + "/testCLUE/afqmc/train.json";
34   std::string test_file = datasets_root_path_ + "/testCLUE/afqmc/test.json";
35   std::string eval_file = datasets_root_path_ + "/testCLUE/afqmc/dev.json";
36   std::string task = "AFQMC";
37   std::string usage = "train";
38   std::shared_ptr<Dataset> ds = CLUE({train_file}, task, usage, 0, ShuffleMode::kFalse);
39   EXPECT_NE(ds, nullptr);
40 
41   // Create an iterator over the result of the above dataset
42   // This will trigger the creation of the Execution Tree and launch it.
43   std::shared_ptr<Iterator> iter = ds->CreateIterator();
44   EXPECT_NE(iter, nullptr);
45 
46   // Iterate the dataset and get each row
47   std::unordered_map<std::string, mindspore::MSTensor> row;
48   ASSERT_OK(iter->GetNextRow(&row));
49 
50   EXPECT_NE(row.find("sentence1"), row.end());
51   std::vector<std::string> expected_result = {"蚂蚁借呗等额还款能否换成先息后本", "蚂蚁花呗说我违约了",
52                                                "帮我看看本月花呗账单结清了没"};
53 
54   uint64_t i = 0;
55   while (row.size() != 0) {
56     auto text = row["sentence1"];
57     std::shared_ptr<Tensor> de_text;
58     ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
59     std::string_view sv;
60     ASSERT_OK(de_text->GetItemAt(&sv, {}));
61     std::string ss(sv);
62     EXPECT_STREQ(ss.c_str(), expected_result[i].c_str());
63     MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
64     ASSERT_OK(iter->GetNextRow(&row));
65     i++;
66   }
67 
68   // Expect 3 samples
69   EXPECT_EQ(i, 3);
70 
71   // Manually terminate the pipeline
72   iter->Stop();
73 
74   // test
75   usage = "test";
76   expected_result = {"借呗取消的时间", "网商贷用什么方法转变成借呗", "我的借呗为什么开通不了"};
77   ds = CLUE({test_file}, task, usage, 0, ShuffleMode::kFalse);
78   EXPECT_NE(ds, nullptr);
79   iter = ds->CreateIterator();
80   EXPECT_NE(iter, nullptr);
81   ASSERT_OK(iter->GetNextRow(&row));
82   EXPECT_NE(row.find("sentence1"), row.end());
83   i = 0;
84   while (row.size() != 0) {
85     auto text = row["sentence1"];
86     std::shared_ptr<Tensor> de_text;
87     ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
88     std::string_view sv;
89     ASSERT_OK(de_text->GetItemAt(&sv, {}));
90     std::string ss(sv);
91     EXPECT_STREQ(ss.c_str(), expected_result[i].c_str());
92     ASSERT_OK(iter->GetNextRow(&row));
93     i++;
94   }
95   iter->Stop();
96 
97   // eval
98   usage = "eval";
99   expected_result = {"你有花呗吗", "吃饭能用花呗吗", "蚂蚁花呗支付金额有什么限制"};
100   ds = CLUE({eval_file}, task, usage, 0, ShuffleMode::kFalse);
101   EXPECT_NE(ds, nullptr);
102   iter = ds->CreateIterator();
103   EXPECT_NE(iter, nullptr);
104   ASSERT_OK(iter->GetNextRow(&row));
105   EXPECT_NE(row.find("sentence1"), row.end());
106   i = 0;
107   while (row.size() != 0) {
108     auto text = row["sentence1"];
109     std::shared_ptr<Tensor> de_text;
110     ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
111     std::string_view sv;
112     ASSERT_OK(de_text->GetItemAt(&sv, {}));
113     std::string ss(sv);
114     EXPECT_STREQ(ss.c_str(), expected_result[i].c_str());
115     ASSERT_OK(iter->GetNextRow(&row));
116     i++;
117   }
118   iter->Stop();
119 }
120 
/// Reads the AFQMC train file with the sample count limited to 2 and checks
/// that exactly two rows containing a "sentence1" column are produced.
TEST_F(MindDataTestPipeline, TestCLUEDatasetBasic) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetBasic.";

  // Create a CLUEFile Dataset, with single CLUE file
  std::string clue_file = datasets_root_path_ + "/testCLUE/afqmc/train.json";
  std::string task = "AFQMC";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 2);
  // Fatal check: ds is dereferenced by the very next statement.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Fatal check: row["sentence1"] below would silently insert a missing key.
  ASSERT_NE(row.find("sentence1"), row.end());
  uint64_t i = 0;
  while (row.size() != 0) {
    auto text = row["sentence1"];
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    i++;
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // Expect 2 samples
  EXPECT_EQ(i, 2U);

  // Manually terminate the pipeline
  iter->Stop();
}
155 
/// Builds two 2-sample AFQMC datasets, repeats them 2x and 3x, projects both
/// onto "sentence1", concatenates them and checks the total row count.
TEST_F(MindDataTestPipeline, TestCLUEDatasetBasicWithPipeline) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetBasicWithPipeline.";

  // Create two CLUEFile Dataset, with single CLUE file
  std::string clue_file = datasets_root_path_ + "/testCLUE/afqmc/train.json";
  std::string task = "AFQMC";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds1 = CLUE({clue_file}, task, usage, 2);
  std::shared_ptr<Dataset> ds2 = CLUE({clue_file}, task, usage, 2);
  // Fatal checks throughout: every dataset pointer is dereferenced right after it is checked.
  ASSERT_NE(ds1, nullptr);
  ASSERT_NE(ds2, nullptr);

  // Create two Repeat operation on ds
  int32_t repeat_num = 2;
  ds1 = ds1->Repeat(repeat_num);
  ASSERT_NE(ds1, nullptr);
  repeat_num = 3;
  ds2 = ds2->Repeat(repeat_num);
  ASSERT_NE(ds2, nullptr);

  // Create two Project operation on ds
  std::vector<std::string> column_project = {"sentence1"};
  ds1 = ds1->Project(column_project);
  ASSERT_NE(ds1, nullptr);
  ds2 = ds2->Project(column_project);
  ASSERT_NE(ds2, nullptr);

  // Create a Concat operation on the ds
  ds1 = ds1->Concat({ds2});
  ASSERT_NE(ds1, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds1->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Fatal check: row["sentence1"] below would silently insert a missing key.
  ASSERT_NE(row.find("sentence1"), row.end());
  uint64_t i = 0;
  while (row.size() != 0) {
    auto text = row["sentence1"];
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    i++;
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // Expect 10 samples: 2 samples x 2 repeats + 2 samples x 3 repeats
  EXPECT_EQ(i, 10U);

  // Manually terminate the pipeline
  iter->Stop();
}
211 
/// Checks the dataset-size and column-name getters on a 2-sample AFQMC
/// train dataset without iterating over it.
TEST_F(MindDataTestPipeline, TestCLUEGetters) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEGetters.";

  // Create a CLUEFile Dataset, with single CLUE file
  std::string clue_file = datasets_root_path_ + "/testCLUE/afqmc/train.json";
  std::string task = "AFQMC";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 2);
  std::vector<std::string> column_names = {"label", "sentence1", "sentence2"};
  // Fatal check: both getters below dereference ds.
  ASSERT_NE(ds, nullptr);

  EXPECT_EQ(ds->GetDatasetSize(), 2);
  EXPECT_EQ(ds->GetColumnNames(), column_names);
}
226 
/// Reads the CMNLI train file unshuffled and checks the "sentence1" column
/// of each of the three rows against the expected text.
TEST_F(MindDataTestPipeline, TestCLUEDatasetCMNLI) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetCMNLI.";

  // Create a CLUEFile Dataset, with single CLUE file
  std::string clue_file = datasets_root_path_ + "/testCLUE/cmnli/train.json";
  std::string task = "CMNLI";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 0, ShuffleMode::kFalse);
  // Fatal check: ds is dereferenced by the very next statement.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Fatal check: row["sentence1"] below would silently insert a missing key.
  ASSERT_NE(row.find("sentence1"), row.end());
  std::vector<std::string> expected_result = {"你应该给这件衣服定一个价格。", "我怎么知道他要说什么", "向左。"};

  uint64_t i = 0;
  while (row.size() != 0) {
    // Guard the index so an unexpected extra row fails the test instead of reading out of bounds.
    ASSERT_LT(i, expected_result.size());
    auto text = row["sentence1"];
    std::shared_ptr<Tensor> de_text;
    ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
    std::string_view sv;
    ASSERT_OK(de_text->GetItemAt(&sv, {}));
    EXPECT_EQ(std::string(sv), expected_result[i]);
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 3 samples
  EXPECT_EQ(i, 3U);

  // Manually terminate the pipeline
  iter->Stop();
}
269 
/// Reads the CSL train file unshuffled and checks the "abst" column of each
/// of the three rows against the expected text.
TEST_F(MindDataTestPipeline, TestCLUEDatasetCSL) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetCSL.";

  // Create a CLUEFile Dataset, with single CLUE file
  std::string clue_file = datasets_root_path_ + "/testCLUE/csl/train.json";
  std::string task = "CSL";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 0, ShuffleMode::kFalse);
  // Fatal check: ds is dereferenced by the very next statement.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Fatal check: row["abst"] below would silently insert a missing key.
  ASSERT_NE(row.find("abst"), row.end());
  std::vector<std::string> expected_result = {"这是一段长文本", "这是一段长文本", "这是一段长文本"};

  uint64_t i = 0;
  while (row.size() != 0) {
    // Guard the index so an unexpected extra row fails the test instead of reading out of bounds.
    ASSERT_LT(i, expected_result.size());
    auto text = row["abst"];
    std::shared_ptr<Tensor> de_text;
    ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
    std::string_view sv;
    ASSERT_OK(de_text->GetItemAt(&sv, {}));
    EXPECT_EQ(std::string(sv), expected_result[i]);
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 3 samples
  EXPECT_EQ(i, 3U);

  // Manually terminate the pipeline
  iter->Stop();
}
312 
/// Reads the AFQMC train file with the trailing shard arguments set to 3 and 0
/// and checks that this yields a single row.
TEST_F(MindDataTestPipeline, TestCLUEDatasetDistribution) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetDistribution.";

  // Create a CLUEFile Dataset, with single CLUE file
  std::string clue_file = datasets_root_path_ + "/testCLUE/afqmc/train.json";
  std::string task = "AFQMC";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 0, ShuffleMode::kGlobal, 3, 0);
  // Fatal check: ds is dereferenced by the very next statement.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Fatal check: row["sentence1"] below would silently insert a missing key.
  ASSERT_NE(row.find("sentence1"), row.end());
  uint64_t i = 0;
  while (row.size() != 0) {
    auto text = row["sentence1"];
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    i++;
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // Expect 1 sample
  EXPECT_EQ(i, 1U);

  // Manually terminate the pipeline
  iter->Stop();
}
347 
TEST_F(MindDataTestPipeline,TestCLUEDatasetFail)348 TEST_F(MindDataTestPipeline, TestCLUEDatasetFail) {
349   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetFail.";
350   // Create a CLUE Dataset
351   std::string clue_file = datasets_root_path_ + "/testCLUE/wsc/train.json";
352   std::string task = "WSC";
353   std::string usage = "train";
354   std::string invalid_clue_file = "./NotExistFile";
355 
356   std::shared_ptr<Dataset> ds0 = CLUE({}, task, usage);
357   EXPECT_NE(ds0, nullptr);
358   // Create an iterator over the result of the above dataset
359   std::shared_ptr<Iterator> iter0 = ds0->CreateIterator();
360   // Expect failure: invalid CLUE input
361   EXPECT_EQ(iter0, nullptr);
362 
363   std::shared_ptr<Dataset> ds1 = CLUE({invalid_clue_file}, task, usage);
364   EXPECT_NE(ds1, nullptr);
365   // Create an iterator over the result of the above dataset
366   std::shared_ptr<Iterator> iter1 = ds1->CreateIterator();
367   // Expect failure: invalid CLUE input
368   EXPECT_EQ(iter1, nullptr);
369 
370   std::shared_ptr<Dataset> ds2 = CLUE({clue_file}, "invalid_task", usage);
371   EXPECT_NE(ds2, nullptr);
372   // Create an iterator over the result of the above dataset
373   std::shared_ptr<Iterator> iter2 = ds2->CreateIterator();
374   // Expect failure: invalid CLUE input
375   EXPECT_EQ(iter2, nullptr);
376 
377   std::shared_ptr<Dataset> ds3 = CLUE({clue_file}, task, "invalid_usage");
378   EXPECT_NE(ds3, nullptr);
379   // Create an iterator over the result of the above dataset
380   std::shared_ptr<Iterator> iter3 = ds3->CreateIterator();
381   // Expect failure: invalid CLUE input
382   EXPECT_EQ(iter3, nullptr);
383 
384   std::shared_ptr<Dataset> ds4 = CLUE({clue_file}, task, usage, 0, ShuffleMode::kGlobal, 2, 2);
385   EXPECT_NE(ds4, nullptr);
386   // Create an iterator over the result of the above dataset
387   std::shared_ptr<Iterator> iter4 = ds4->CreateIterator();
388   // Expect failure: invalid CLUE input
389   EXPECT_EQ(iter4, nullptr);
390 
391   std::shared_ptr<Dataset> ds5 = CLUE({clue_file}, task, usage, -1, ShuffleMode::kGlobal);
392   EXPECT_NE(ds5, nullptr);
393   // Create an iterator over the result of the above dataset
394   std::shared_ptr<Iterator> iter5 = ds5->CreateIterator();
395   // Expect failure: invalid CLUE input
396   EXPECT_EQ(iter5, nullptr);
397 
398   std::shared_ptr<Dataset> ds6 = CLUE({clue_file}, task, usage, 0, ShuffleMode::kGlobal, -1);
399   EXPECT_NE(ds6, nullptr);
400   // Create an iterator over the result of the above dataset
401   std::shared_ptr<Iterator> iter6 = ds6->CreateIterator();
402   // Expect failure: invalid CLUE input
403   EXPECT_EQ(iter6, nullptr);
404 
405   std::shared_ptr<Dataset> ds7 = CLUE({clue_file}, task, usage, 0, ShuffleMode::kGlobal, 0, -1);
406   EXPECT_NE(ds7, nullptr);
407   // Create an iterator over the result of the above dataset
408   std::shared_ptr<Iterator> iter7 = ds7->CreateIterator();
409   // Expect failure: invalid CLUE input
410   EXPECT_EQ(iter7, nullptr);
411 }
412 
/// Reads the IFLYTEK train file unshuffled and checks the "sentence" column
/// of each of the three rows against the expected text.
TEST_F(MindDataTestPipeline, TestCLUEDatasetIFLYTEK) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetIFLYTEK.";

  // Create a CLUEFile Dataset, with single CLUE file
  std::string clue_file = datasets_root_path_ + "/testCLUE/iflytek/train.json";
  std::string task = "IFLYTEK";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 0, ShuffleMode::kFalse);
  // Fatal check: ds is dereferenced by the very next statement.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Fatal check: row["sentence"] below would silently insert a missing key.
  ASSERT_NE(row.find("sentence"), row.end());
  std::vector<std::string> expected_result = {"第一个文本", "第二个文本", "第三个文本"};

  uint64_t i = 0;
  while (row.size() != 0) {
    // Guard the index so an unexpected extra row fails the test instead of reading out of bounds.
    ASSERT_LT(i, expected_result.size());
    auto text = row["sentence"];
    std::shared_ptr<Tensor> de_text;
    ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
    std::string_view sv;
    ASSERT_OK(de_text->GetItemAt(&sv, {}));
    EXPECT_EQ(std::string(sv), expected_result[i]);
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 3 samples
  EXPECT_EQ(i, 3U);

  // Manually terminate the pipeline
  iter->Stop();
}
455 
TEST_F(MindDataTestPipeline,TestCLUEDatasetShuffleFilesA)456 TEST_F(MindDataTestPipeline, TestCLUEDatasetShuffleFilesA) {
457   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetShuffleFilesA.";
458   // Test CLUE Dataset with files shuffle, num_parallel_workers=1
459 
460   // Set configuration
461   uint32_t original_seed = GlobalContext::config_manager()->seed();
462   uint32_t original_num_parallel_workers = GlobalContext::config_manager()->num_parallel_workers();
463   MS_LOG(DEBUG) << "ORIGINAL seed: " << original_seed << ", num_parallel_workers: " << original_num_parallel_workers;
464   GlobalContext::config_manager()->set_seed(135);
465   GlobalContext::config_manager()->set_num_parallel_workers(1);
466 
467   // Create a CLUE Dataset, with two text files, dev.json and train.json, in lexicographical order
468   // Note: train.json has 3 rows
469   // Note: dev.json has 3 rows
470   // Use default of all samples
471   // They have the same keywords
472   // Set shuffle to files shuffle
473   std::string clue_file1 = datasets_root_path_ + "/testCLUE/afqmc/train.json";
474   std::string clue_file2 = datasets_root_path_ + "/testCLUE/afqmc/dev.json";
475   std::string task = "AFQMC";
476   std::string usage = "train";
477   std::shared_ptr<Dataset> ds = CLUE({clue_file2, clue_file1}, task, usage, 0, ShuffleMode::kFiles);
478   EXPECT_NE(ds, nullptr);
479 
480   // Create an iterator over the result of the above dataset.
481   // This will trigger the creation of the Execution Tree and launch it.
482   std::shared_ptr<Iterator> iter = ds->CreateIterator();
483   EXPECT_NE(iter, nullptr);
484 
485   // Iterate the dataset and get each row
486   std::unordered_map<std::string, mindspore::MSTensor> row;
487   ASSERT_OK(iter->GetNextRow(&row));
488 
489   EXPECT_NE(row.find("sentence1"), row.end());
490   std::vector<std::string> expected_result = {"你有花呗吗",
491                                               "吃饭能用花呗吗",
492                                               "蚂蚁花呗支付金额有什么限制",
493                                               "蚂蚁借呗等额还款能否换成先息后本",
494                                               "蚂蚁花呗说我违约了",
495                                               "帮我看看本月花呗账单结清了没"};
496 
497   uint64_t i = 0;
498   while (row.size() != 0) {
499     auto text = row["sentence1"];
500     std::shared_ptr<Tensor> de_text;
501     ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
502     std::string_view sv;
503     ASSERT_OK(de_text->GetItemAt(&sv, {}));
504     std::string ss(sv);
505     MS_LOG(INFO) << "Text length: " << ss.length() << ", Text: " << ss.substr(0, 50);
506     // Compare against expected result
507     EXPECT_STREQ(ss.c_str(), expected_result[i].c_str());
508     i++;
509     ASSERT_OK(iter->GetNextRow(&row));
510   }
511 
512   // Expect 3 + 3 = 6 samples
513   EXPECT_EQ(i, 6);
514 
515   // Manually terminate the pipeline
516   iter->Stop();
517 
518   // Restore configuration
519   GlobalContext::config_manager()->set_seed(original_seed);
520   GlobalContext::config_manager()->set_num_parallel_workers(original_num_parallel_workers);
521 }
522 
TEST_F(MindDataTestPipeline,TestCLUEDatasetShuffleFilesB)523 TEST_F(MindDataTestPipeline, TestCLUEDatasetShuffleFilesB) {
524   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetShuffleFilesB.";
525   // Test CLUE Dataset with files shuffle, num_parallel_workers=1
526 
527   // Set configuration
528   uint32_t original_seed = GlobalContext::config_manager()->seed();
529   uint32_t original_num_parallel_workers = GlobalContext::config_manager()->num_parallel_workers();
530   MS_LOG(DEBUG) << "ORIGINAL seed: " << original_seed << ", num_parallel_workers: " << original_num_parallel_workers;
531   GlobalContext::config_manager()->set_seed(135);
532   GlobalContext::config_manager()->set_num_parallel_workers(1);
533 
534   // Create a CLUE Dataset, with two text files, train.json and dev.json, in non-lexicographical order
535   // Note: train.json has 3 rows
536   // Note: dev.json has 3 rows
537   // Use default of all samples
538   // They have the same keywords
539   // Set shuffle to files shuffle
540   std::string clue_file1 = datasets_root_path_ + "/testCLUE/afqmc/train.json";
541   std::string clue_file2 = datasets_root_path_ + "/testCLUE/afqmc/dev.json";
542   std::string task = "AFQMC";
543   std::string usage = "train";
544   std::shared_ptr<Dataset> ds = CLUE({clue_file1, clue_file2}, task, usage, 0, ShuffleMode::kFiles);
545   EXPECT_NE(ds, nullptr);
546 
547   // Create an iterator over the result of the above dataset.
548   // This will trigger the creation of the Execution Tree and launch it.
549   std::shared_ptr<Iterator> iter = ds->CreateIterator();
550   EXPECT_NE(iter, nullptr);
551 
552   // Iterate the dataset and get each row
553   std::unordered_map<std::string, mindspore::MSTensor> row;
554   ASSERT_OK(iter->GetNextRow(&row));
555 
556   EXPECT_NE(row.find("sentence1"), row.end());
557   std::vector<std::string> expected_result = {"你有花呗吗",
558                                               "吃饭能用花呗吗",
559                                               "蚂蚁花呗支付金额有什么限制",
560                                               "蚂蚁借呗等额还款能否换成先息后本",
561                                               "蚂蚁花呗说我违约了",
562                                               "帮我看看本月花呗账单结清了没"};
563 
564   uint64_t i = 0;
565   while (row.size() != 0) {
566     auto text = row["sentence1"];
567     std::shared_ptr<Tensor> de_text;
568     ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
569     std::string_view sv;
570     ASSERT_OK(de_text->GetItemAt(&sv, {}));
571     std::string ss(sv);
572     // Compare against expected result
573     EXPECT_STREQ(ss.c_str(), expected_result[i].c_str());
574     i++;
575     ASSERT_OK(iter->GetNextRow(&row));
576   }
577 
578   // Expect 3 + 3 = 6 samples
579   EXPECT_EQ(i, 6);
580 
581   // Manually terminate the pipeline
582   iter->Stop();
583 
584   // Restore configuration
585   GlobalContext::config_manager()->set_seed(original_seed);
586   GlobalContext::config_manager()->set_num_parallel_workers(original_num_parallel_workers);
587 }
588 
/// Reads the AFQMC train file with global shuffle, seed 135 and four parallel
/// workers, and checks the resulting row order of the "sentence1" column.
TEST_F(MindDataTestPipeline, TestCLUEDatasetShuffleGlobal) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetShuffleGlobal.";
  // Test CLUE Dataset with global shuffle

  // Set configuration
  uint32_t original_seed = GlobalContext::config_manager()->seed();
  uint32_t original_num_parallel_workers = GlobalContext::config_manager()->num_parallel_workers();
  MS_LOG(DEBUG) << "ORIGINAL seed: " << original_seed << ", num_parallel_workers: " << original_num_parallel_workers;

  // Restores the global configuration when the test body exits. A fatal
  // assertion returns early, so a restore placed at the end of the test would
  // be skipped and the modified seed/worker count would leak into later tests.
  struct ConfigGuard {
    uint32_t seed;
    uint32_t num_parallel_workers;
    ~ConfigGuard() {
      GlobalContext::config_manager()->set_seed(seed);
      GlobalContext::config_manager()->set_num_parallel_workers(num_parallel_workers);
    }
  };
  const ConfigGuard restore_config{original_seed, original_num_parallel_workers};

  GlobalContext::config_manager()->set_seed(135);
  GlobalContext::config_manager()->set_num_parallel_workers(4);

  // Create a CLUEFile Dataset, with single CLUE file
  std::string clue_file = datasets_root_path_ + "/testCLUE/afqmc/train.json";
  std::string task = "AFQMC";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 0, ShuffleMode::kGlobal);
  // Fatal check: ds is dereferenced by the very next statement.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Fatal check: row["sentence1"] below would silently insert a missing key.
  ASSERT_NE(row.find("sentence1"), row.end());
  std::vector<std::string> expected_result = {"蚂蚁花呗说我违约了", "帮我看看本月花呗账单结清了没",
                                              "蚂蚁借呗等额还款能否换成先息后本"};
  uint64_t i = 0;
  while (row.size() != 0) {
    // Guard the index so an unexpected extra row fails the test instead of reading out of bounds.
    ASSERT_LT(i, expected_result.size());
    auto text = row["sentence1"];
    std::shared_ptr<Tensor> de_text;
    ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
    std::string_view sv;
    ASSERT_OK(de_text->GetItemAt(&sv, {}));
    EXPECT_EQ(std::string(sv), expected_result[i]);
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    i++;
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // Expect 3 samples
  EXPECT_EQ(i, 3U);

  // Manually terminate the pipeline
  iter->Stop();

  // Configuration is restored by restore_config when it goes out of scope.
}
643 
/// Reads the TNEWS train file unshuffled and checks the "sentence" column of
/// each of the three rows against the expected text.
TEST_F(MindDataTestPipeline, TestCLUEDatasetTNEWS) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetTNEWS.";

  // Create a CLUEFile Dataset, with single CLUE file
  std::string clue_file = datasets_root_path_ + "/testCLUE/tnews/train.json";
  std::string task = "TNEWS";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 0, ShuffleMode::kFalse);
  // Fatal check: ds is dereferenced by the very next statement.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Fatal check: row["sentence"] below would silently insert a missing key.
  ASSERT_NE(row.find("sentence"), row.end());
  std::vector<std::string> expected_result = {"新闻1", "新闻2", "新闻3"};

  uint64_t i = 0;
  while (row.size() != 0) {
    // Guard the index so an unexpected extra row fails the test instead of reading out of bounds.
    ASSERT_LT(i, expected_result.size());
    auto text = row["sentence"];
    std::shared_ptr<Tensor> de_text;
    ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
    std::string_view sv;
    ASSERT_OK(de_text->GetItemAt(&sv, {}));
    EXPECT_EQ(std::string(sv), expected_result[i]);
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 3 samples
  EXPECT_EQ(i, 3U);

  // Manually terminate the pipeline
  iter->Stop();
}
686 
/// Reads the WSC train file unshuffled and checks the "text" column of each
/// of the three rows against the expected text.
TEST_F(MindDataTestPipeline, TestCLUEDatasetWSC) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCLUEDatasetWSC.";

  // Create a CLUEFile Dataset, with single CLUE file
  std::string clue_file = datasets_root_path_ + "/testCLUE/wsc/train.json";
  std::string task = "WSC";
  std::string usage = "train";
  std::shared_ptr<Dataset> ds = CLUE({clue_file}, task, usage, 0, ShuffleMode::kFalse);
  // Fatal check: ds is dereferenced by the very next statement.
  ASSERT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  ASSERT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Fatal check: row["text"] below would silently insert a missing key.
  ASSERT_NE(row.find("text"), row.end());
  std::vector<std::string> expected_result = {"小明呢,他在哪?", "小红刚刚看到小明,他在操场",
                                              "等小明回来,小张你叫他交作业"};

  uint64_t i = 0;
  while (row.size() != 0) {
    // Guard the index so an unexpected extra row fails the test instead of reading out of bounds.
    ASSERT_LT(i, expected_result.size());
    auto text = row["text"];
    std::shared_ptr<Tensor> de_text;
    ASSERT_OK(Tensor::CreateFromMSTensor(text, &de_text));
    std::string_view sv;
    ASSERT_OK(de_text->GetItemAt(&sv, {}));
    EXPECT_EQ(std::string(sv), expected_result[i]);
    MS_LOG(INFO) << "Tensor text shape: " << text.Shape();
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Expect 3 samples
  EXPECT_EQ(i, 3U);

  // Manually terminate the pipeline
  iter->Stop();
}
730