• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2022 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include <memory>
17 #include <string>
18 #include <vector>
19 
20 #include "common/common.h"
21 #include "include/api/status.h"
22 #include "minddata/dataset/include/dataset/config.h"
23 #include "minddata/dataset/include/dataset/datasets.h"
24 #include "minddata/dataset/include/dataset/text.h"
25 #include "minddata/dataset/include/dataset/transforms.h"
26 #include "minddata/dataset/text/char_n_gram.h"
27 #include "minddata/dataset/text/fast_text.h"
28 #include "minddata/dataset/text/glove.h"
29 #include "minddata/dataset/text/vectors.h"
30 
31 using namespace mindspore::dataset;
32 using mindspore::Status;
33 using mindspore::dataset::CharNGram;
34 using mindspore::dataset::FastText;
35 using mindspore::dataset::GloVe;
36 using mindspore::dataset::ShuffleMode;
37 using mindspore::dataset::Tensor;
38 using mindspore::dataset::Vectors;
39 using mindspore::dataset::Vocab;
40 
// Test fixture for the text/tokenizer pipeline tests in this file.
// Inherits shared dataset-test utilities (e.g. datasets_root_path_)
// from UT::DatasetOpTesting; adds no state of its own.
class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};
44 
45 /// Feature: BasicTokenizer op
46 /// Description: Test BasicTokenizer op on TextFileDataset with default inputs
47 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess1.";
  // Test BasicTokenizer with default parameters

  // Create a TextFile dataset (unshuffled so rows match `expected` in order)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Take operation on ds: only the first 6 lines are verified here
  ds = ds->Take(6);
  EXPECT_NE(ds, nullptr);

  // Create BasicTokenizer operation on ds
  std::shared_ptr<TensorTransform> basic_tokenizer = std::make_shared<text::BasicTokenizer>();
  EXPECT_NE(basic_tokenizer, nullptr);

  // Create Map operation on ds, applying the tokenizer to the "text" column
  ds = ds->Map({basic_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Expected tokens per input line: English splits on words, CJK per character.
  // NOTE(review): some literals render as U+FFFD replacement characters here —
  // presumably multi-byte emoji in the repository file; verify against it.
  std::vector<std::vector<std::string>> expected = {
    {"Welcome", "to", "Beijing", "北", "京", "欢", "迎", "您"},
    {"長", "風", "破", "浪", "會", "有", "時", ",", "直", "掛", "雲", "帆", "濟", "滄", "海"},
    {"��", "嘿", "嘿", "��", "哈", "哈", "��", "大", "笑", "��", "嘻", "嘻"},
    {"明", "朝", "(", "1368", "—",  "1644", "年", ")", "和", "清", "朝", "(", "1644", "—",  "1911", "年", ")",
     ",", "是", "中", "国",   "封", "建",   "王", "朝", "史", "上", "最", "后", "两",   "个", "朝",   "代"},
    {"明", "代",   "(", "1368",     "-",  "1644", ")",      "と", "清", "代",    "(", "1644",
     "-",  "1911", ")", "は",       "、", "中",   "国",      "の", "封", "建",    "王", "朝",
     "の", "歴",   "史", "における", "最", "後",   "の2つの", "王", "朝", "でした"},
    {"명나라", "(", "1368", "-",    "1644", ")",      "와",       "청나라", "(",  "1644",    "-",
     "1911",   ")", "는",   "중국", "봉건", "왕조의", "역사에서", "마지막", "두", "왕조였다"}};

  // Compare each produced row against the matching expected token vector
  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 6 taken rows must have been consumed
  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}
108 
109 /// Feature: BasicTokenizer op
110 /// Description: Test BasicTokenizer op on TextFileDataset with lower_case=true
111 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess2.";
  // Exercise BasicTokenizer with lower_case enabled.

  // Source the pipeline from the tokenizer test data, in file order.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Drop the first 6 lines; only the line after them is checked here.
  dataset = dataset->Skip(6);
  EXPECT_NE(dataset, nullptr);

  // Tokenizer configured with lower_case=true, mapped over the "text" column.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::BasicTokenizer>(true);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  // The remaining line must tokenize to these lower-cased words.
  std::vector<std::string> expected_words = {"this", "is", "a", "funky", "string"};
  std::shared_ptr<Tensor> expected_de_tensor;
  ASSERT_OK(Tensor::CreateFromVector(expected_words, &expected_de_tensor));
  mindspore::MSTensor expected_ms_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));

  // Every produced row (there should be exactly one) matches the expectation.
  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_tensor = row["text"];
    EXPECT_MSTENSOR_EQ(text_tensor, expected_ms_tensor);
    ASSERT_OK(iterator->GetNextRow(&row));
    ++row_count;
  }

  EXPECT_EQ(row_count, 1);

  // Tear the pipeline down explicitly.
  iterator->Stop();
}
161 
162 /// Feature: BasicTokenizer op
163 /// Description: Test BasicTokenizer op on TextFileDataset with with_offsets=true and lower_case=true
164 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestBasicTokenizerSuccess3)165 TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess3) {
166   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess3.";
167   // Test BasicTokenizer with with_offsets true and lower_case true
168 
169   // Create a TextFile dataset
170   std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
171   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
172   EXPECT_NE(ds, nullptr);
173 
174   // Create Skip operation on ds
175   ds = ds->Skip(6);
176   EXPECT_NE(ds, nullptr);
177 
178   // Create BasicTokenizer operation on ds
179   std::shared_ptr<TensorTransform> basic_tokenizer =
180     std::make_shared<text::BasicTokenizer>(true, false, NormalizeForm::kNone, true, true);
181   EXPECT_NE(basic_tokenizer, nullptr);
182 
183   // Create Map operation on ds
184   ds = ds->Map({basic_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
185   EXPECT_NE(ds, nullptr);
186 
187   // Create an iterator over the result of the above dataset
188   // This will trigger the creation of the Execution Tree and launch it.
189   std::shared_ptr<Iterator> iter = ds->CreateIterator();
190   EXPECT_NE(iter, nullptr);
191 
192   // Iterate the dataset and get each row
193   std::unordered_map<std::string, mindspore::MSTensor> row;
194   ASSERT_OK(iter->GetNextRow(&row));
195 
196   std::vector<std::string> expected_tokens = {"this", "is", "a", "funky", "string"};
197   std::vector<uint32_t> expected_offsets_start = {0, 5, 8, 10, 16};
198   std::vector<uint32_t> expected_offsets_limit = {4, 7, 9, 15, 22};
199 
200   std::shared_ptr<Tensor> de_expected_tokens;
201   ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens));
202   mindspore::MSTensor ms_expected_tokens =
203     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
204 
205   std::shared_ptr<Tensor> de_expected_offsets_start;
206   ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start));
207   mindspore::MSTensor ms_expected_offsets_start =
208     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
209 
210   std::shared_ptr<Tensor> de_expected_offsets_limit;
211   ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit));
212   mindspore::MSTensor ms_expected_offsets_limit =
213     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
214 
215   uint64_t i = 0;
216   while (row.size() != 0) {
217     auto ind = row["token"];
218     EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens);
219 
220     auto start = row["offsets_start"];
221     EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
222 
223     auto limit = row["offsets_limit"];
224     EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
225 
226     ASSERT_OK(iter->GetNextRow(&row));
227     i++;
228   }
229 
230   EXPECT_EQ(i, 1);
231 
232   // Manually terminate the pipeline
233   iter->Stop();
234 }
235 
// Shared vocabulary for all BertTokenizer tests below. Mixes CJK characters,
// English word pieces ("##" marks a sub-word continuation), digits,
// punctuation, a space token, and BERT special / reserved tokens
// ([CLS], [SEP], [UNK], [PAD], [MASK], [unusedN]).
// NOTE(review): some entries render as U+FFFD replacement characters here —
// presumably emoji in the repository file's encoding; verify against it.
std::vector<std::string> list = {
  "床", "前", "明",    "月",    "光",    "疑",    "是",      "地",        "上",        "霜",   "举",    "头",
  "望", "低", "思",    "故",    "乡",    "繁",    "體",      "字",        "嘿",        "哈",   "大",    "笑",
  "嘻", "i",  "am",    "mak",   "make",  "small", "mistake", "##s",       "during",    "work", "##ing", "hour",
  "��",  "��",  "��",     "��",     "+",     "/",     "-",       "=",         "12",        "28",   "40",    "16",
  " ",  "I",  "[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]",  "[unused1]", "[unused10]"};
242 
243 /// Feature: BertTokenizer op
244 /// Description: Test BertTokenizer op on TextFileDataset with default parameters
245 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess1.";
  // Exercise BertTokenizer with its default parameters.

  // Source the pipeline from the BERT tokenizer test data, in file order.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Only the first 4 lines are verified in this case.
  dataset = dataset->Take(4);
  EXPECT_NE(dataset, nullptr);

  // Build the vocabulary from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status build_status = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(build_status, Status::OK());

  // Default-parameter tokenizer mapped over the "text" column.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::BertTokenizer>(vocab);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  // Each input line splits into its individual CJK characters.
  std::vector<std::vector<std::string>> expected = {{"床", "前", "明", "月", "光"},
                                                    {"疑", "是", "地", "上", "霜"},
                                                    {"举", "头", "望", "明", "月"},
                                                    {"低", "头", "思", "故", "乡"}};

  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_tensor = row["text"];
    std::shared_ptr<Tensor> expected_de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[row_count], &expected_de_tensor));
    mindspore::MSTensor expected_ms_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(text_tensor, expected_ms_tensor);

    ASSERT_OK(iterator->GetNextRow(&row));
    ++row_count;
  }

  // All 4 taken rows must have been consumed.
  EXPECT_EQ(row_count, 4);

  // Tear the pipeline down explicitly.
  iterator->Stop();
}
304 
305 /// Feature: BertTokenizer op
306 /// Description: Test BertTokenizer op on TextFileDataset with lower_case=true
307 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestBertTokenizerSuccess2)308 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess2) {
309   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess2.";
310   // Test BertTokenizer with lower_case true
311 
312   // Create a TextFile dataset
313   std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
314   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
315   EXPECT_NE(ds, nullptr);
316 
317   // Create Skip operation on ds
318   ds = ds->Skip(4);
319   EXPECT_NE(ds, nullptr);
320 
321   // Create Take operation on ds
322   ds = ds->Take(1);
323   EXPECT_NE(ds, nullptr);
324 
325   // Create a vocab from vector
326   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
327   Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
328   EXPECT_EQ(s, Status::OK());
329 
330   // Create BertTokenizer operation on ds
331   std::shared_ptr<TensorTransform> bert_tokenizer =
332     std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", true);
333   EXPECT_NE(bert_tokenizer, nullptr);
334 
335   // Create Map operation on ds
336   ds = ds->Map({bert_tokenizer}, {"text"});
337   EXPECT_NE(ds, nullptr);
338 
339   // Create an iterator over the result of the above dataset
340   // This will trigger the creation of the Execution Tree and launch it.
341   std::shared_ptr<Iterator> iter = ds->CreateIterator();
342   EXPECT_NE(iter, nullptr);
343 
344   // Iterate the dataset and get each row
345   std::unordered_map<std::string, mindspore::MSTensor> row;
346   ASSERT_OK(iter->GetNextRow(&row));
347 
348   std::vector<std::string> expected = {"i",   "am",     "mak",  "##ing", "small", "mistake",
349                                        "##s", "during", "work", "##ing", "hour",  "##s"};
350   std::shared_ptr<Tensor> de_expected_tensor;
351   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
352   mindspore::MSTensor expected_tensor =
353     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
354 
355   uint64_t i = 0;
356   while (row.size() != 0) {
357     auto ind = row["text"];
358     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
359     ASSERT_OK(iter->GetNextRow(&row));
360     i++;
361   }
362 
363   EXPECT_EQ(i, 1);
364 
365   // Manually terminate the pipeline
366   iter->Stop();
367 }
368 
369 /// Feature: BertTokenizer op
370 /// Description: Test BertTokenizer op on TextFileDataset with NormalizeForm::kNfc
371 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess3.";
  // Test BertTokenizer with normalization_form NFC (NormalizeForm::kNfc)

  // Create a TextFile dataset (unshuffled so rows match `expected` in order)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Skip operation on ds
  ds = ds->Skip(5);
  EXPECT_NE(ds, nullptr);

  // Create Take operation on ds: lines 6 and 7 of the file are checked
  ds = ds->Take(2);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector (the shared `list` defined above)
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create BertTokenizer operation on ds with NormalizeForm::kNfc
  std::shared_ptr<TensorTransform> bert_tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", false, false, NormalizeForm::kNfc);
  EXPECT_NE(bert_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({bert_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Expected tokens per row.
  // NOTE(review): some literals render as U+FFFD replacement characters here —
  // presumably emoji in the repository file's encoding; verify against it.
  std::vector<std::vector<std::string>> expected = {
    {"��", "嘿", "嘿", "��", "哈", "哈", "��", "大", "笑", "��", "嘻", "嘻"}, {"繁", "體", "字"}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Both taken rows must have been consumed
  EXPECT_EQ(i, 2);

  // Manually terminate the pipeline
  iter->Stop();
}
433 
434 /// Feature: BertTokenizer op
435 /// Description: Test BertTokenizer op on TextFileDataset with keep_whitespace=true
436 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess4) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess4.";
  // Exercise BertTokenizer with keep_whitespace enabled.

  // Source the pipeline from the BERT tokenizer test data, in file order.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Select exactly the eighth line of the file.
  dataset = dataset->Skip(7);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(1);
  EXPECT_NE(dataset, nullptr);

  // Build the vocabulary from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status build_status = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(build_status, Status::OK());

  // Tokenizer with keep_whitespace=true, mapped over the "text" column.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", false, true);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  // Out-of-vocab word maps to [UNK]; whitespace survives as its own token.
  std::vector<std::string> expected_tokens = {"[UNK]", " ", "[CLS]"};
  std::shared_ptr<Tensor> expected_de_tensor;
  ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &expected_de_tensor));
  mindspore::MSTensor expected_ms_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));

  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_tensor = row["text"];
    EXPECT_MSTENSOR_EQ(text_tensor, expected_ms_tensor);
    ASSERT_OK(iterator->GetNextRow(&row));
    ++row_count;
  }

  EXPECT_EQ(row_count, 1);

  // Tear the pipeline down explicitly.
  iterator->Stop();
}
496 
497 /// Feature: BertTokenizer op
498 /// Description: Test BertTokenizer op on TextFileDataset with empty unknown_token and keep_whitespace=true
499 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess5) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess5.";
  // Exercise BertTokenizer with an empty unknown_token and keep_whitespace enabled.

  // Source the pipeline from the BERT tokenizer test data, in file order.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Select exactly the eighth line of the file.
  dataset = dataset->Skip(7);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(1);
  EXPECT_NE(dataset, nullptr);

  // Build the vocabulary from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status build_status = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(build_status, Status::OK());

  // Empty unknown_token, keep_whitespace=true, mapped over the "text" column.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "", false, true);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  // With no unknown_token substitution the raw word comes through unchanged.
  std::vector<std::string> expected_tokens = {"unused", " ", "[CLS]"};
  std::shared_ptr<Tensor> expected_de_tensor;
  ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &expected_de_tensor));
  mindspore::MSTensor expected_ms_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));

  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_tensor = row["text"];
    EXPECT_MSTENSOR_EQ(text_tensor, expected_ms_tensor);
    ASSERT_OK(iterator->GetNextRow(&row));
    ++row_count;
  }

  EXPECT_EQ(row_count, 1);

  // Tear the pipeline down explicitly.
  iterator->Stop();
}
559 
560 /// Feature: BertTokenizer op
561 /// Description: Test BertTokenizer op with preserve_unused_token=false, empty unknown_token, and keep_whitespace=true
562 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess6) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess6.";
  // Exercise BertTokenizer with preserve_unused_token=false, an empty
  // unknown_token, and keep_whitespace enabled.

  // Source the pipeline from the BERT tokenizer test data, in file order.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Select exactly the eighth line of the file.
  dataset = dataset->Skip(7);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(1);
  EXPECT_NE(dataset, nullptr);

  // Build the vocabulary from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status build_status = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(build_status, Status::OK());

  // preserve_unused_token=false means specials like [CLS] are not protected.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "", false, true, NormalizeForm::kNone, false);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  // "[CLS]" is split apart because unused tokens are not preserved.
  std::vector<std::string> expected_tokens = {"unused", " ", "[", "CLS", "]"};
  std::shared_ptr<Tensor> expected_de_tensor;
  ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &expected_de_tensor));
  mindspore::MSTensor expected_ms_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));

  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_tensor = row["text"];
    EXPECT_MSTENSOR_EQ(text_tensor, expected_ms_tensor);
    ASSERT_OK(iterator->GetNextRow(&row));
    ++row_count;
  }

  EXPECT_EQ(row_count, 1);

  // Tear the pipeline down explicitly.
  iterator->Stop();
}
622 
623 /// Feature: BertTokenizer op
624 /// Description: Test BertTokenizer op with with_offsets=true and lower_case=true
625 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestBertTokenizerSuccess7)626 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess7) {
627   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess7.";
628   // Test BertTokenizer with with_offsets true and lower_case true
629 
630   // Create a TextFile dataset
631   std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
632   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
633   EXPECT_NE(ds, nullptr);
634 
635   // Create Skip operation on ds
636   ds = ds->Skip(4);
637   EXPECT_NE(ds, nullptr);
638 
639   // Create Take operation on ds
640   ds = ds->Take(1);
641   EXPECT_NE(ds, nullptr);
642 
643   // Create a vocab from vector
644   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
645   Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
646   EXPECT_EQ(s, Status::OK());
647 
648   // Create BertTokenizer operation on ds
649   std::shared_ptr<TensorTransform> bert_tokenizer =
650     std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", true, false, NormalizeForm::kNone, true, true);
651   EXPECT_NE(bert_tokenizer, nullptr);
652 
653   // Create Map operation on ds
654   ds = ds->Map({bert_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
655   EXPECT_NE(ds, nullptr);
656 
657   // Create an iterator over the result of the above dataset
658   // This will trigger the creation of the Execution Tree and launch it.
659   std::shared_ptr<Iterator> iter = ds->CreateIterator();
660   EXPECT_NE(iter, nullptr);
661 
662   // Iterate the dataset and get each row
663   std::unordered_map<std::string, mindspore::MSTensor> row;
664   ASSERT_OK(iter->GetNextRow(&row));
665 
666   std::vector<std::string> expected_tokens = {"i",   "am",     "mak",  "##ing", "small", "mistake",
667                                               "##s", "during", "work", "##ing", "hour",  "##s"};
668   std::vector<uint32_t> expected_offsets_start = {0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46};
669   std::vector<uint32_t> expected_offsets_limit = {1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47};
670 
671   std::shared_ptr<Tensor> de_expected_tokens;
672   ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens));
673   mindspore::MSTensor ms_expected_tokens =
674     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
675 
676   std::shared_ptr<Tensor> de_expected_offsets_start;
677   ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start));
678   mindspore::MSTensor ms_expected_offsets_start =
679     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
680 
681   std::shared_ptr<Tensor> de_expected_offsets_limit;
682   ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit));
683   mindspore::MSTensor ms_expected_offsets_limit =
684     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
685 
686   uint64_t i = 0;
687   while (row.size() != 0) {
688     auto ind = row["token"];
689     EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens);
690 
691     auto start = row["offsets_start"];
692     EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
693 
694     auto limit = row["offsets_limit"];
695     EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
696 
697     ASSERT_OK(iter->GetNextRow(&row));
698     i++;
699   }
700 
701   EXPECT_EQ(i, 1);
702 
703   // Manually terminate the pipeline
704   iter->Stop();
705 }
706 
707 /// Feature: BertTokenizer op
708 /// Description: Test BertTokenizer op with nullptr vocab
709 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestBertTokenizerFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail1.";
  // A null vocab must make the pipeline fail validation.

  // Source the pipeline from the BERT tokenizer test data, in file order.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Construction of the transform object itself still succeeds.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::BertTokenizer>(nullptr);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer});
  EXPECT_NE(dataset, nullptr);

  // Validation happens at iterator-creation time: expect it to return nullptr
  // for the invalid (nullptr vocab) BertTokenizer input.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
731 
732 /// Feature: BertTokenizer op
733 /// Description: Test BertTokenizer op with negative max_bytes_per_token
734 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestBertTokenizerFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail2.";
  // Test BertTokenizer with negative max_bytes_per_token

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector (the shared `list` defined above)
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create BertTokenizer operation on ds with invalid max_bytes_per_token = -1
  std::shared_ptr<TensorTransform> bert_tokenizer = std::make_shared<text::BertTokenizer>(vocab, "##", -1);
  EXPECT_NE(bert_tokenizer, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({bert_tokenizer});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid BertTokenizer input with negative max_bytes_per_token
  EXPECT_EQ(iter, nullptr);
}
761 
762 /// Feature: CaseFold op
763 /// Description: Test CaseFold op on TextFileDataset with default parameters
764 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestCaseFoldSuccess) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaseFoldSuccess.";
  // CaseFold should lower-case cased characters and pass everything else through.

  // Source dataset: four lines of mixed-case / multilingual text, unshuffled.
  const std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  std::shared_ptr<TensorTransform> casefold = std::make_shared<text::CaseFold>();
  EXPECT_NE(casefold, nullptr);

  dataset = dataset->Map({casefold}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launching the iterator builds and starts the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  const std::vector<std::string> expected = {"welcome to beijing!", "北京欢迎您!", "我喜欢english!", "  "};

  // Each row carries one scalar string; compare against the matching expectation.
  uint64_t rows_seen = 0;
  for (; !row.empty(); ++rows_seen) {
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[rows_seen], &de_tensor));
    auto want = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(row["text"], want);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(rows_seen, 4);

  // Shut the pipeline down explicitly.
  it->Stop();
}
809 
810 /// Feature: FilterWikipediaXML op
811 /// Description: Test FilterWikipediaXML op in pipeline mode
812 /// Expectation: The data is processed successfully
TEST_F(MindDataTestPipeline, TestFilterWikipediaXMLSuccess) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFilterWikipediaXMLSuccess.";
  // FilterWikipediaXML should strip Wikipedia XML markup, leaving plain text
  // (possibly empty strings for markup-only lines).

  // Source dataset: three lines of XML-ish text, unshuffled.
  const std::string data_file = datasets_root_path_ + "/testTokenizerData/2.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  std::shared_ptr<TensorTransform> filter_op = std::make_shared<text::FilterWikipediaXML>();
  EXPECT_NE(filter_op, nullptr);

  dataset = dataset->Map({filter_op}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launching the iterator builds and starts the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  const std::vector<std::string> expected = {"welcome to beijing", "", ""};

  // Compare each scalar-string row against its expectation.
  uint64_t rows_seen = 0;
  for (; !row.empty(); ++rows_seen) {
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[rows_seen], &de_tensor));
    auto want = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(row["text"], want);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(rows_seen, 3);

  // Shut the pipeline down explicitly.
  it->Stop();
}
858 
859 /// Feature: JiebaTokenizer op
860 /// Description: Test JiebaTokenizer op when the mode is JiebaMode::kMp and with_offsets=false
861 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess) {
  // JiebaTokenizer with JiebaMode::kMp and with_offsets left at its default (false).
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess.";

  // Input text plus the two dictionaries the tokenizer needs.
  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launching the iterator builds and starts the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  // MP segmentation keeps longer dictionary words intact.
  const std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  auto expected_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  uint64_t rows_seen = 0;
  for (; !row.empty(); ++rows_seen) {
    EXPECT_MSTENSOR_EQ(row["text"], expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(rows_seen, 1);

  // Shut the pipeline down explicitly.
  it->Stop();
}
910 
911 /// Feature: JiebaTokenizer op
912 /// Description: Test JiebaTokenizer op when the mode is JiebaMode::kHmm and with_offsets=false
913 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess1) {
  // JiebaTokenizer with JiebaMode::kHmm and with_offsets left at its default (false).
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess1.";

  // Input text plus the two dictionaries the tokenizer needs.
  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kHmm);
  EXPECT_NE(tokenizer, nullptr);

  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launching the iterator builds and starts the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  // HMM segmentation yields a finer-grained split than MP mode.
  const std::vector<std::string> expected = {"今天", "天气", "太", "好", "了", "我们", "一起", "去", "外面", "玩", "吧"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  auto expected_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  uint64_t rows_seen = 0;
  for (; !row.empty(); ++rows_seen) {
    EXPECT_MSTENSOR_EQ(row["text"], expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(rows_seen, 1);

  // Shut the pipeline down explicitly.
  it->Stop();
}
962 
963 /// Feature: JiebaTokenizer op
964 /// Description: Test JiebaTokenizer op when the mode is JiebaMode::kMp and with_offsets=true
965 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestJiebaTokenizerSuccess2)966 TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess2) {
967   // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is true.
968   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess2.";
969 
970   // Create a TextFile dataset
971   std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
972   std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
973   std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
974   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
975   EXPECT_NE(ds, nullptr);
976 
977   // Create jieba_tokenizer operation on ds
978   std::shared_ptr<TensorTransform> jieba_tokenizer =
979     std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp, true);
980   EXPECT_NE(jieba_tokenizer, nullptr);
981 
982   // Create Map operation on ds
983   ds = ds->Map({jieba_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
984   EXPECT_NE(ds, nullptr);
985 
986   // Create an iterator over the result of the above dataset
987   // This will trigger the creation of the Execution Tree and launch it.
988   std::shared_ptr<Iterator> iter = ds->CreateIterator();
989   EXPECT_NE(iter, nullptr);
990 
991   // Iterate the dataset and get each row
992   std::unordered_map<std::string, mindspore::MSTensor> row;
993   ASSERT_OK(iter->GetNextRow(&row));
994 
995   std::vector<std::string> expected_tokens = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
996   std::vector<uint32_t> expected_offsets_start = {0, 12, 21, 27, 33, 36, 42};
997   std::vector<uint32_t> expected_offsets_limit = {12, 21, 27, 33, 36, 42, 48};
998 
999   std::shared_ptr<Tensor> de_expected_tokens;
1000   ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens));
1001   mindspore::MSTensor ms_expected_tokens =
1002     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
1003 
1004   std::shared_ptr<Tensor> de_expected_offsets_start;
1005   ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start));
1006   mindspore::MSTensor ms_expected_offsets_start =
1007     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
1008 
1009   std::shared_ptr<Tensor> de_expected_offsets_limit;
1010   ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit));
1011   mindspore::MSTensor ms_expected_offsets_limit =
1012     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
1013 
1014   uint64_t i = 0;
1015   while (row.size() != 0) {
1016     auto ind = row["token"];
1017     EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens);
1018 
1019     auto start = row["offsets_start"];
1020     EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
1021 
1022     auto limit = row["offsets_limit"];
1023     EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
1024 
1025     ASSERT_OK(iter->GetNextRow(&row));
1026     i++;
1027   }
1028 
1029   EXPECT_EQ(i, 1);
1030 
1031   // Manually terminate the pipeline
1032   iter->Stop();
1033 }
1034 
1035 /// Feature: JiebaTokenizer op
1036 /// Description: Test JiebaTokenizer op with empty hmm_path
1037 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail1) {
  // An empty hmm_path must invalidate the pipeline at iterator-creation time.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail1.";

  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // hmm_path deliberately empty.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::JiebaTokenizer>("", mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  dataset = dataset->Map({tokenizer});
  EXPECT_NE(dataset, nullptr);

  // Expect failure: invalid JiebaTokenizer input (parameter hmm_path is empty).
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_EQ(it, nullptr);
}
1062 
1063 /// Feature: JiebaTokenizer op
1064 /// Description: Test JiebaTokenizer op with empty mp_path
1065 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail2) {
  // An empty mp_path must invalidate the pipeline at iterator-creation time.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail2.";

  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // mp_path deliberately empty.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, "", JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  dataset = dataset->Map({tokenizer});
  EXPECT_NE(dataset, nullptr);

  // Expect failure: invalid JiebaTokenizer input (parameter mp_path is empty).
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_EQ(it, nullptr);
}
1090 
1091 /// Feature: JiebaTokenizer op
1092 /// Description: Test JiebaTokenizer op with invalid hmm_path
1093 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail3) {
  // A non-existent hmm_path must invalidate the pipeline at iterator-creation time.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail3.";

  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  const std::string hmm_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // hmm_path points to a file that does not exist.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path_invalid, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  dataset = dataset->Map({tokenizer});
  EXPECT_NE(dataset, nullptr);

  // Expect failure: invalid JiebaTokenizer input (parameter hmm_path is invalid path).
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_EQ(it, nullptr);
}
1119 
1120 /// Feature: JiebaTokenizer op
1121 /// Description: Test JiebaTokenizer op with invalid mp_path
1122 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail4) {
  // A non-existent mp_path must invalidate the pipeline at iterator-creation time.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail4.";

  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // mp_path points to a file that does not exist.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path_invalid, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  dataset = dataset->Map({tokenizer});
  EXPECT_NE(dataset, nullptr);

  // Expect failure: invalid JiebaTokenizer input (parameter mp_path is invalid path).
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_EQ(it, nullptr);
}
1148 
1149 /// Feature: JiebaTokenizer op
1150 /// Description: Test AddWord of JiebaTokenizer when the freq is not provided (default 0)
1151 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) {
  // AddWord with the freq argument omitted (defaults to 0): the added word
  // must appear as a single token in the segmentation.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord.";

  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file});
  EXPECT_NE(dataset, nullptr);

  // Concrete JiebaTokenizer type (not TensorTransform) so AddWord is callable.
  std::shared_ptr<text::JiebaTokenizer> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  // freq not provided (default 0).
  ASSERT_OK(tokenizer->AddWord("男默女泪"));

  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launching the iterator builds and starts the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  const std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  auto expected_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  uint64_t rows_seen = 0;
  for (; !row.empty(); ++rows_seen) {
    EXPECT_MSTENSOR_EQ(row["text"], expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(rows_seen, 1);

  // Shut the pipeline down explicitly.
  it->Stop();
}
1203 
1204 /// Feature: JiebaTokenizer op
1205 /// Description: Test AddWord of JiebaTokenizer when the freq is set explicitly to 0
1206 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) {
  // AddWord with freq explicitly 0: behaves the same as omitting freq — the
  // added word must still appear as a single token.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord1.";

  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file});
  EXPECT_NE(dataset, nullptr);

  // Concrete JiebaTokenizer type (not TensorTransform) so AddWord is callable.
  std::shared_ptr<text::JiebaTokenizer> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  // freq set explicitly to 0.
  ASSERT_OK(tokenizer->AddWord("男默女泪", 0));

  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launching the iterator builds and starts the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  const std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  auto expected_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  uint64_t rows_seen = 0;
  for (; !row.empty(); ++rows_seen) {
    EXPECT_MSTENSOR_EQ(row["text"], expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(rows_seen, 1);

  // Shut the pipeline down explicitly.
  it->Stop();
}
1258 
1259 /// Feature: JiebaTokenizer op
1260 /// Description: Test AddWord of JiebaTokenizer when the freq is set to 10
1261 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) {
  // AddWord with freq=10: the added word must appear as a single token.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord2.";

  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file});
  EXPECT_NE(dataset, nullptr);

  // Concrete JiebaTokenizer type (not TensorTransform) so AddWord is callable.
  std::shared_ptr<text::JiebaTokenizer> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  // freq set to 10.
  ASSERT_OK(tokenizer->AddWord("男默女泪", 10));

  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launching the iterator builds and starts the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  const std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  auto expected_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  uint64_t rows_seen = 0;
  for (; !row.empty(); ++rows_seen) {
    EXPECT_MSTENSOR_EQ(row["text"], expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(rows_seen, 1);

  // Shut the pipeline down explicitly.
  it->Stop();
}
1313 
1314 /// Feature: JiebaTokenizer op
1315 /// Description: Test AddWord of JiebaTokenizer when the freq is 20000 which affects the result of segmentation
1316 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) {
  // AddWord with freq=20000 — a frequency high enough to change how the
  // sentence is segmented ("江大桥" wins over the default split).
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord3.";

  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file});
  EXPECT_NE(dataset, nullptr);

  // Concrete JiebaTokenizer type (not TensorTransform) so AddWord is callable.
  std::shared_ptr<text::JiebaTokenizer> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  // freq set to 20000.
  ASSERT_OK(tokenizer->AddWord("江大桥", 20000));

  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launching the iterator builds and starts the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  const std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  auto expected_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  uint64_t rows_seen = 0;
  for (; !row.empty(); ++rows_seen) {
    EXPECT_MSTENSOR_EQ(row["text"], expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(rows_seen, 1);

  // Shut the pipeline down explicitly.
  it->Stop();
}
1368 
1369 /// Feature: JiebaTokenizer op
1370 /// Description: Test AddWord of JiebaTokenizer with invalid parameters
1371 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) {
  // AddWord must reject an empty word and a negative frequency.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWordFail.";

  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Case 1: empty word is invalid.
  std::shared_ptr<text::JiebaTokenizer> tokenizer_empty_word =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer_empty_word, nullptr);
  EXPECT_NE(tokenizer_empty_word->AddWord("", 10), Status::OK());

  // Case 2: negative frequency is invalid.
  std::shared_ptr<text::JiebaTokenizer> tokenizer_negative_freq =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer_negative_freq, nullptr);
  EXPECT_NE(tokenizer_negative_freq->AddWord("我们", -1), Status::OK());
}
1394 
1395 /// Feature: JiebaTokenizer op
1396 /// Description: Test AddDict of JiebaTokenizer when the input is a vector of word-freq pair
1397 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDict) {
  // AddDict with a vector of (word, freq) pairs; the high-frequency entry
  // "江大桥" must change the segmentation, just like AddWord with freq 20000.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDict.";

  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file});
  EXPECT_NE(dataset, nullptr);

  // Concrete JiebaTokenizer type (not TensorTransform) so AddDict is callable.
  std::shared_ptr<text::JiebaTokenizer> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  // User dictionary: one word with frequency 20000.
  const std::vector<std::pair<std::string, int64_t>> user_dict = {{"江大桥", 20000}};
  ASSERT_OK(tokenizer->AddDict(user_dict));

  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launching the iterator builds and starts the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  const std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  auto expected_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  uint64_t rows_seen = 0;
  for (; !row.empty(); ++rows_seen) {
    EXPECT_MSTENSOR_EQ(row["text"], expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(rows_seen, 1);

  // Shut the pipeline down explicitly.
  it->Stop();
}
1450 
1451 /// Feature: JiebaTokenizer op
1452 /// Description: Test AddDict of JiebaTokenizer when the input is a path to dict
1453 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestJiebaTokenizerAddDictFromFile)1454 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDictFromFile) {
1455   // Testing AddDict of JiebaTokenizer when the input is a path to dict.
1456   // Test error scenario for AddDict: invalid path
1457   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDictFromFile.";
1458 
1459   // Create a TextFile dataset
1460   std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
1461   std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1462   std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1463   std::shared_ptr<Dataset> ds = TextFile({data_file});
1464   EXPECT_NE(ds, nullptr);
1465 
1466   // Create jieba_tokenizer operation on ds
1467   std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
1468     std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1469   EXPECT_NE(jieba_tokenizer, nullptr);
1470 
1471   // Load dict from txt file
1472   std::string user_dict_path = datasets_root_path_ + "/testJiebaDataset/user_dict.txt";
1473   std::string invalid_path = datasets_root_path_ + "/testJiebaDataset/invalid_path.txt";
1474   EXPECT_ERROR(jieba_tokenizer->AddDict(invalid_path));
1475   ASSERT_OK(jieba_tokenizer->AddDict(user_dict_path));
1476 
1477   // Create Map operation on ds
1478   ds = ds->Map({jieba_tokenizer}, {"text"});
1479   EXPECT_NE(ds, nullptr);
1480 
1481   // Create an iterator over the result of the above dataset
1482   // This will trigger the creation of the Execution Tree and launch it.
1483   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1484   EXPECT_NE(iter, nullptr);
1485 
1486   // Iterate the dataset and get each row
1487   std::unordered_map<std::string, mindspore::MSTensor> row;
1488   ASSERT_OK(iter->GetNextRow(&row));
1489 
1490   std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
1491   std::shared_ptr<Tensor> de_expected_tensor;
1492   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
1493   mindspore::MSTensor expected_tensor =
1494     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1495 
1496   uint64_t i = 0;
1497   while (row.size() != 0) {
1498     auto txt = row["text"];
1499     EXPECT_MSTENSOR_EQ(txt, expected_tensor);
1500     ASSERT_OK(iter->GetNextRow(&row));
1501     i++;
1502   }
1503 
1504   EXPECT_EQ(i, 1);
1505 
1506   // Manually terminate the pipeline
1507   iter->Stop();
1508 }
1509 
1510 /// Feature: SlidingWindow op
1511 /// Description: Test SlidingWindow when the axis is 0
1512 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestSlidingWindowSuccess)1513 TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) {
1514   // Testing the parameter of SlidingWindow interface when the axis is 0.
1515   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess.";
1516 
1517   // Create a TextFile dataset
1518   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
1519   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1520   EXPECT_NE(ds, nullptr);
1521 
1522   // Create white_tokenizer operation on ds
1523   std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
1524   EXPECT_NE(white_tokenizer, nullptr);
1525   // Create sliding_window operation on ds
1526   std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(3, 0);
1527   EXPECT_NE(sliding_window, nullptr);
1528 
1529   // Create Map operation on ds
1530   ds = ds->Map({white_tokenizer, sliding_window}, {"text"});
1531   EXPECT_NE(ds, nullptr);
1532 
1533   // Create an iterator over the result of the above dataset
1534   // This will trigger the creation of the Execution Tree and launch it.
1535   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1536   EXPECT_NE(iter, nullptr);
1537 
1538   // Iterate the dataset and get each row
1539   std::unordered_map<std::string, mindspore::MSTensor> row;
1540   ASSERT_OK(iter->GetNextRow(&row));
1541 
1542   std::vector<std::vector<std::string>> expected = {{"This", "is", "a", "is", "a", "text", "a", "text", "file."},
1543                                                     {"Be", "happy", "every", "happy", "every", "day."},
1544                                                     {"Good", "luck", "to", "luck", "to", "everyone."}};
1545 
1546   uint64_t i = 0;
1547   while (row.size() != 0) {
1548     auto ind = row["text"];
1549 
1550     std::shared_ptr<Tensor> de_expected_tensor;
1551     int x = expected[i].size() / 3;
1552     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x, 3}), &de_expected_tensor));
1553     mindspore::MSTensor expected_tensor =
1554       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1555     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
1556 
1557     ASSERT_OK(iter->GetNextRow(&row));
1558     i++;
1559   }
1560 
1561   EXPECT_EQ(i, 3);
1562 
1563   // Manually terminate the pipeline
1564   iter->Stop();
1565 }
1566 
1567 /// Feature: SlidingWindow op
1568 /// Description: Test SlidingWindow when the axis is -1
1569 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestSlidingWindowSuccess1)1570 TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess1) {
1571   // Testing the parameter of SlidingWindow interface when the axis is -1.
1572   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess1.";
1573 
1574   // Create a TextFile dataset
1575   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
1576   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1577   EXPECT_NE(ds, nullptr);
1578 
1579   // Create white_tokenizer operation on ds
1580   std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
1581   EXPECT_NE(white_tokenizer, nullptr);
1582   // Create sliding_window operation on ds
1583   std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(2, -1);
1584   EXPECT_NE(sliding_window, nullptr);
1585 
1586   // Create Map operation on ds
1587   ds = ds->Map({white_tokenizer, sliding_window}, {"text"});
1588   EXPECT_NE(ds, nullptr);
1589 
1590   // Create an iterator over the result of the above dataset
1591   // This will trigger the creation of the Execution Tree and launch it.
1592   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1593   EXPECT_NE(iter, nullptr);
1594 
1595   // Iterate the dataset and get each row
1596   std::unordered_map<std::string, mindspore::MSTensor> row;
1597   ASSERT_OK(iter->GetNextRow(&row));
1598 
1599   std::vector<std::vector<std::string>> expected = {{"This", "is", "is", "a", "a", "text", "text", "file."},
1600                                                     {"Be", "happy", "happy", "every", "every", "day."},
1601                                                     {"Good", "luck", "luck", "to", "to", "everyone."}};
1602   uint64_t i = 0;
1603   while (row.size() != 0) {
1604     auto ind = row["text"];
1605 
1606     std::shared_ptr<Tensor> de_expected_tensor;
1607     int x = expected[i].size() / 2;
1608     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x, 2}), &de_expected_tensor));
1609     mindspore::MSTensor expected_tensor =
1610       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1611     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
1612 
1613     ASSERT_OK(iter->GetNextRow(&row));
1614     i++;
1615   }
1616 
1617   EXPECT_EQ(i, 3);
1618 
1619   // Manually terminate the pipeline
1620   iter->Stop();
1621 }
1622 
1623 /// Feature: SlidingWindow op
1624 /// Description: Test SlidingWindow when the width=0
1625 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestSlidingWindowFail1)1626 TEST_F(MindDataTestPipeline, TestSlidingWindowFail1) {
1627   // Testing the incorrect parameter of SlidingWindow interface.
1628   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail1.";
1629 
1630   // Create a TextFile dataset
1631   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
1632   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1633   EXPECT_NE(ds, nullptr);
1634 
1635   // Create sliding_window operation on ds
1636   // Testing the parameter width less than or equal to 0
1637   // The parameter axis support 0 or -1 only for now
1638   std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(0, 0);
1639   EXPECT_NE(sliding_window, nullptr);
1640 
1641   // Create a Map operation on ds
1642   ds = ds->Map({sliding_window});
1643   EXPECT_NE(ds, nullptr);
1644 
1645   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1646   // Expect failure: invalid SlidingWindow input (width less than or equal to 0)
1647   EXPECT_EQ(iter, nullptr);
1648 }
1649 
1650 /// Feature: SlidingWindow op
1651 /// Description: Test SlidingWindow when the width=-2
1652 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestSlidingWindowFail2)1653 TEST_F(MindDataTestPipeline, TestSlidingWindowFail2) {
1654   // Testing the incorrect parameter of SlidingWindow interface.
1655   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail2.";
1656 
1657   // Create a TextFile dataset
1658   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
1659   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1660   EXPECT_NE(ds, nullptr);
1661 
1662   // Create sliding_window operation on ds
1663   // Testing the parameter width less than or equal to 0
1664   // The parameter axis support 0 or -1 only for now
1665   std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(-2, 0);
1666   EXPECT_NE(sliding_window, nullptr);
1667 
1668   // Create a Map operation on ds
1669   ds = ds->Map({sliding_window});
1670   EXPECT_NE(ds, nullptr);
1671 
1672   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1673   // Expect failure: invalid SlidingWindow input (width less than or equal to 0)
1674   EXPECT_EQ(iter, nullptr);
1675 }
1676 
1677 /// Feature: ToNumber op
1678 /// Description: Test ToNumber with integer numbers
1679 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestToNumberSuccess1)1680 TEST_F(MindDataTestPipeline, TestToNumberSuccess1) {
1681   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberSuccess1.";
1682   // Test ToNumber with integer numbers
1683 
1684   std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1685 
1686   // Create a TextFile dataset
1687   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1688   EXPECT_NE(ds, nullptr);
1689 
1690   // Create a Take operation on ds
1691   ds = ds->Take(8);
1692   EXPECT_NE(ds, nullptr);
1693 
1694   // Create ToNumber operation on ds
1695   std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
1696   EXPECT_NE(to_number, nullptr);
1697 
1698   // Create a Map operation on ds
1699   ds = ds->Map({to_number}, {"text"});
1700   EXPECT_NE(ds, nullptr);
1701 
1702   // Create an iterator over the result of the above dataset
1703   // This will trigger the creation of the Execution Tree and launch it.
1704   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1705   EXPECT_NE(iter, nullptr);
1706 
1707   // Iterate the dataset and get each row
1708   std::unordered_map<std::string, mindspore::MSTensor> row;
1709   ASSERT_OK(iter->GetNextRow(&row));
1710 
1711   std::vector<int64_t> expected = {-121, 14, -2219, 7623, -8162536, 162371864, -1726483716, 98921728421};
1712 
1713   uint64_t i = 0;
1714   while (row.size() != 0) {
1715     auto ind = row["text"];
1716     std::shared_ptr<Tensor> de_expected_tensor;
1717     ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
1718     mindspore::MSTensor ms_expected_tensor =
1719       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1720     EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
1721     ASSERT_OK(iter->GetNextRow(&row));
1722     i++;
1723   }
1724 
1725   EXPECT_EQ(i, 8);
1726 
1727   // Manually terminate the pipeline
1728   iter->Stop();
1729 }
1730 
1731 /// Feature: ToNumber op
1732 /// Description: Test ToNumber with float numbers
1733 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestToNumberSuccess2)1734 TEST_F(MindDataTestPipeline, TestToNumberSuccess2) {
1735   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberSuccess2.";
1736   // Test ToNumber with float numbers
1737 
1738   std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1739 
1740   // Create a TextFile dataset
1741   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1742   EXPECT_NE(ds, nullptr);
1743 
1744   // Create a Skip operation on ds
1745   ds = ds->Skip(8);
1746   EXPECT_NE(ds, nullptr);
1747 
1748   // Create a Take operation on ds
1749   ds = ds->Take(6);
1750   EXPECT_NE(ds, nullptr);
1751 
1752   // Create ToNumber operation on ds
1753   std::shared_ptr<TensorTransform> to_number =
1754     std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
1755   EXPECT_NE(to_number, nullptr);
1756 
1757   // Create a Map operation on ds
1758   ds = ds->Map({to_number}, {"text"});
1759   EXPECT_NE(ds, nullptr);
1760 
1761   // Create an iterator over the result of the above dataset
1762   // This will trigger the creation of the Execution Tree and launch it.
1763   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1764   EXPECT_NE(iter, nullptr);
1765 
1766   // Iterate the dataset and get each row
1767   std::unordered_map<std::string, mindspore::MSTensor> row;
1768   ASSERT_OK(iter->GetNextRow(&row));
1769 
1770   std::vector<double_t> expected = {-1.1, 1.4, -2219.321, 7623.453, -816256.234282, 162371864.243243};
1771 
1772   uint64_t i = 0;
1773   while (row.size() != 0) {
1774     auto ind = row["text"];
1775     std::shared_ptr<Tensor> de_expected_tensor;
1776     ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
1777     mindspore::MSTensor ms_expected_tensor =
1778       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1779     EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
1780     ASSERT_OK(iter->GetNextRow(&row));
1781     i++;
1782   }
1783 
1784   EXPECT_EQ(i, 6);
1785 
1786   // Manually terminate the pipeline
1787   iter->Stop();
1788 }
1789 
1790 /// Feature: ToNumber op
1791 /// Description: Test ToNumber with overflow integer numbers
1792 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline,TestToNumberFail1)1793 TEST_F(MindDataTestPipeline, TestToNumberFail1) {
1794   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail1.";
1795   // Test ToNumber with overflow integer numbers
1796 
1797   std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1798 
1799   // Create a TextFile dataset
1800   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1801   EXPECT_NE(ds, nullptr);
1802 
1803   // Create a Skip operation on ds
1804   ds = ds->Skip(2);
1805   EXPECT_NE(ds, nullptr);
1806 
1807   // Create a Take operation on ds
1808   ds = ds->Take(6);
1809   EXPECT_NE(ds, nullptr);
1810 
1811   // Create ToNumber operation on ds
1812   std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
1813   EXPECT_NE(to_number, nullptr);
1814 
1815   // Create a Map operation on ds
1816   ds = ds->Map({to_number}, {"text"});
1817   EXPECT_NE(ds, nullptr);
1818 
1819   // Create an iterator over the result of the above dataset
1820   // This will trigger the creation of the Execution Tree and launch it.
1821   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1822   EXPECT_NE(iter, nullptr);
1823 
1824   // Iterate the dataset and get each row
1825   std::unordered_map<std::string, mindspore::MSTensor> row;
1826 
1827   // Expect error: input out of bounds of int8
1828   EXPECT_ERROR(iter->GetNextRow(&row));
1829 
1830   uint64_t i = 0;
1831   while (row.size() != 0) {
1832     EXPECT_ERROR(iter->GetNextRow(&row));
1833     i++;
1834   }
1835 
1836   // Expect failure: GetNextRow fail and return nothing
1837   EXPECT_EQ(i, 0);
1838 
1839   // Manually terminate the pipeline
1840   iter->Stop();
1841 }
1842 
1843 /// Feature: ToNumber op
1844 /// Description: Test ToNumber with overflow float numbers
1845 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline,TestToNumberFail2)1846 TEST_F(MindDataTestPipeline, TestToNumberFail2) {
1847   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail2.";
1848   // Test ToNumber with overflow float numbers
1849 
1850   std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1851 
1852   // Create a TextFile dataset
1853   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1854   EXPECT_NE(ds, nullptr);
1855 
1856   // Create a Skip operation on ds
1857   ds = ds->Skip(12);
1858   EXPECT_NE(ds, nullptr);
1859 
1860   // Create a Take operation on ds
1861   ds = ds->Take(2);
1862   EXPECT_NE(ds, nullptr);
1863 
1864   // Create ToNumber operation on ds
1865   std::shared_ptr<TensorTransform> to_number =
1866     std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
1867   EXPECT_NE(to_number, nullptr);
1868 
1869   // Create a Map operation on ds
1870   ds = ds->Map({to_number}, {"text"});
1871   EXPECT_NE(ds, nullptr);
1872 
1873   // Create an iterator over the result of the above dataset
1874   // This will trigger the creation of the Execution Tree and launch it.
1875   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1876   EXPECT_NE(iter, nullptr);
1877 
1878   // Iterate the dataset and get each row
1879   std::unordered_map<std::string, mindspore::MSTensor> row;
1880 
1881   // Expect error: input out of bounds of float16
1882   EXPECT_ERROR(iter->GetNextRow(&row));
1883 
1884   uint64_t i = 0;
1885   while (row.size() != 0) {
1886     EXPECT_ERROR(iter->GetNextRow(&row));
1887     i++;
1888   }
1889 
1890   // Expect failure: GetNextRow fail and return nothing
1891   EXPECT_EQ(i, 0);
1892 
1893   // Manually terminate the pipeline
1894   iter->Stop();
1895 }
1896 
1897 /// Feature: ToNumber op
1898 /// Description: Test ToNumber with non numerical input
1899 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline,TestToNumberFail3)1900 TEST_F(MindDataTestPipeline, TestToNumberFail3) {
1901   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail3.";
1902   // Test ToNumber with non numerical input
1903 
1904   std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1905 
1906   // Create a TextFile dataset
1907   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1908   EXPECT_NE(ds, nullptr);
1909 
1910   // Create a Skip operation on ds
1911   ds = ds->Skip(14);
1912   EXPECT_NE(ds, nullptr);
1913 
1914   // Create ToNumber operation on ds
1915   std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
1916   EXPECT_NE(to_number, nullptr);
1917 
1918   // Create a Map operation on ds
1919   ds = ds->Map({to_number}, {"text"});
1920   EXPECT_NE(ds, nullptr);
1921 
1922   // Create an iterator over the result of the above dataset
1923   // This will trigger the creation of the Execution Tree and launch it.
1924   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1925   EXPECT_NE(iter, nullptr);
1926 
1927   // Iterate the dataset and get each row
1928   std::unordered_map<std::string, mindspore::MSTensor> row;
1929 
1930   // Expect error: invalid input which is non numerical
1931   EXPECT_ERROR(iter->GetNextRow(&row));
1932 
1933   uint64_t i = 0;
1934   while (row.size() != 0) {
1935     EXPECT_ERROR(iter->GetNextRow(&row));
1936     i++;
1937   }
1938 
1939   // Expect failure: GetNextRow fail and return nothing
1940   EXPECT_EQ(i, 0);
1941 
1942   // Manually terminate the pipeline
1943   iter->Stop();
1944 }
1945 
1946 /// Feature: ToNumber op
1947 /// Description: Test ToNumber with non numerical data type (kObjectTypeString)
1948 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestToNumberFail4)1949 TEST_F(MindDataTestPipeline, TestToNumberFail4) {
1950   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail4.";
1951   // Test ToNumber with non numerical data type
1952 
1953   std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1954 
1955   // Create a TextFile dataset
1956   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1957   EXPECT_NE(ds, nullptr);
1958 
1959   // Create ToNumber operation on ds
1960   std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kObjectTypeString);
1961   EXPECT_NE(to_number, nullptr);
1962 
1963   // Create a Map operation on ds
1964   ds = ds->Map({to_number}, {"text"});
1965   EXPECT_NE(ds, nullptr);
1966 
1967   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1968   // Expect failure: invalid parameter with non numerical data type
1969   EXPECT_EQ(iter, nullptr);
1970 }
1971 
1972 /// Feature: ToNumber op
1973 /// Description: Test ToNumber with non numerical data type (kObjectTypeBool)
1974 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestToNumberFail5)1975 TEST_F(MindDataTestPipeline, TestToNumberFail5) {
1976   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail5.";
1977   // Test ToNumber with non numerical data type
1978 
1979   std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1980 
1981   // Create a TextFile dataset
1982   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1983   EXPECT_NE(ds, nullptr);
1984 
1985   // Create ToNumber operation on ds
1986   std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeBool);
1987   EXPECT_NE(to_number, nullptr);
1988 
1989   // Create a Map operation on ds
1990   ds = ds->Map({to_number}, {"text"});
1991   EXPECT_NE(ds, nullptr);
1992 
1993   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1994   // Expect failure: invalid parameter with non numerical data type
1995   EXPECT_EQ(iter, nullptr);
1996 }
1997 
1998 /// Feature: TruncateSequencePair op
1999 /// Description: Test TruncateSequencePair basic usage
2000 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestTruncateSequencePairSuccess1)2001 TEST_F(MindDataTestPipeline, TestTruncateSequencePairSuccess1) {
2002   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairSuccess1.";
2003   // Testing basic TruncateSequencePair
2004 
2005   // Set seed for RandomDataset
2006   auto original_seed = config::get_seed();
2007   bool status_set_seed = config::set_seed(0);
2008   EXPECT_EQ(status_set_seed, true);
2009 
2010   // Set num_parallel_workers for RandomDataset
2011   auto original_worker = config::get_num_parallel_workers();
2012   bool status_set_worker = config::set_num_parallel_workers(1);
2013   EXPECT_EQ(status_set_worker, true);
2014 
2015   // Create a RandomDataset which has column names "col1" and "col2"
2016   std::shared_ptr<SchemaObj> schema = Schema();
2017   ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt16, {5}));
2018   ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt32, {3}));
2019   std::shared_ptr<Dataset> ds = RandomData(3, schema);
2020   EXPECT_NE(ds, nullptr);
2021 
2022   // Create a truncate_sequence_pair operation on ds
2023   std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(4);
2024   EXPECT_NE(truncate_sequence_pair, nullptr);
2025 
2026   // Create Map operation on ds
2027   ds = ds->Map({truncate_sequence_pair}, {"col1", "col2"});
2028   EXPECT_NE(ds, nullptr);
2029 
2030   // Create an iterator over the result of the above dataset
2031   // This will trigger the creation of the Execution Tree and launch it.
2032   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2033   EXPECT_NE(iter, nullptr);
2034 
2035   // Iterate the dataset and get each row
2036   std::unordered_map<std::string, mindspore::MSTensor> row;
2037   ASSERT_OK(iter->GetNextRow(&row));
2038 
2039   std::vector<std::vector<int16_t>> expected1 = {{-29556, -29556}, {-18505, -18505}, {-25958, -25958}};
2040   std::vector<std::vector<int32_t>> expected2 = {
2041     {-1751672937, -1751672937}, {-656877352, -656877352}, {-606348325, -606348325}};
2042 
2043   uint64_t i = 0;
2044   while (row.size() != 0) {
2045     auto ind1 = row["col1"];
2046     auto ind2 = row["col2"];
2047 
2048     std::shared_ptr<Tensor> de_expected_tensor1;
2049     ASSERT_OK(Tensor::CreateFromVector(expected1[i], &de_expected_tensor1));
2050     mindspore::MSTensor expected_tensor1 =
2051       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor1));
2052     EXPECT_MSTENSOR_EQ(ind1, expected_tensor1);
2053 
2054     std::shared_ptr<Tensor> de_expected_tensor2;
2055     ASSERT_OK(Tensor::CreateFromVector(expected2[i], &de_expected_tensor2));
2056     mindspore::MSTensor expected_tensor2 =
2057       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor2));
2058     EXPECT_MSTENSOR_EQ(ind2, expected_tensor2);
2059 
2060     ASSERT_OK(iter->GetNextRow(&row));
2061     i++;
2062   }
2063 
2064   EXPECT_EQ(i, 3);
2065 
2066   // Manually terminate the pipeline
2067   iter->Stop();
2068 
2069   // Restore original seed and num_parallel_workers
2070   status_set_seed = config::set_seed(original_seed);
2071   EXPECT_EQ(status_set_seed, true);
2072   status_set_worker = config::set_num_parallel_workers(original_worker);
2073   EXPECT_EQ(status_set_worker, true);
2074 }
2075 
2076 /// Feature: TruncateSequencePair op
2077 /// Description: Test TruncateSequencePair with odd max_length
2078 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestTruncateSequencePairSuccess2)2079 TEST_F(MindDataTestPipeline, TestTruncateSequencePairSuccess2) {
2080   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairSuccess2.";
2081   // Testing basic TruncateSequencePair with odd max_length
2082 
2083   // Set seed for RandomDataset
2084   auto original_seed = config::get_seed();
2085   bool status_set_seed = config::set_seed(1);
2086   EXPECT_EQ(status_set_seed, true);
2087 
2088   // Set num_parallel_workers for RandomDataset
2089   auto original_worker = config::get_num_parallel_workers();
2090   bool status_set_worker = config::set_num_parallel_workers(1);
2091   EXPECT_EQ(status_set_worker, true);
2092 
2093   // Create a RandomDataset which has column names "col1" and "col2"
2094   std::shared_ptr<SchemaObj> schema = Schema();
2095   ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt32, {4}));
2096   ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt64, {4}));
2097   std::shared_ptr<Dataset> ds = RandomData(4, schema);
2098   EXPECT_NE(ds, nullptr);
2099 
2100   // Create a truncate_sequence_pair operation on ds
2101   std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(5);
2102   EXPECT_NE(truncate_sequence_pair, nullptr);
2103 
2104   // Create Map operation on ds
2105   ds = ds->Map({truncate_sequence_pair}, {"col1", "col2"});
2106   EXPECT_NE(ds, nullptr);
2107 
2108   // Create an iterator over the result of the above dataset
2109   // This will trigger the creation of the Execution Tree and launch it.
2110   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2111   EXPECT_NE(iter, nullptr);
2112 
2113   // Iterate the dataset and get each row
2114   std::unordered_map<std::string, mindspore::MSTensor> row;
2115   ASSERT_OK(iter->GetNextRow(&row));
2116 
2117   std::vector<std::vector<int32_t>> expected1 = {{1785358954, 1785358954, 1785358954},
2118                                                  {-1195853640, -1195853640, -1195853640},
2119                                                  {0, 0, 0},
2120                                                  {1296911693, 1296911693, 1296911693}};
2121   std::vector<std::vector<int64_t>> expected2 = {
2122     {-1, -1}, {-1229782938247303442, -1229782938247303442}, {2314885530818453536, 2314885530818453536}, {-1, -1}};
2123 
2124   uint64_t i = 0;
2125   while (row.size() != 0) {
2126     auto ind1 = row["col1"];
2127     auto ind2 = row["col2"];
2128 
2129     std::shared_ptr<Tensor> de_expected_tensor1;
2130     ASSERT_OK(Tensor::CreateFromVector(expected1[i], &de_expected_tensor1));
2131     mindspore::MSTensor expected_tensor1 =
2132       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor1));
2133     EXPECT_MSTENSOR_EQ(ind1, expected_tensor1);
2134 
2135     std::shared_ptr<Tensor> de_expected_tensor2;
2136     ASSERT_OK(Tensor::CreateFromVector(expected2[i], &de_expected_tensor2));
2137     mindspore::MSTensor expected_tensor2 =
2138       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor2));
2139     EXPECT_MSTENSOR_EQ(ind2, expected_tensor2);
2140 
2141     ASSERT_OK(iter->GetNextRow(&row));
2142     i++;
2143   }
2144 
2145   EXPECT_EQ(i, 4);
2146 
2147   // Manually terminate the pipeline
2148   iter->Stop();
2149 
2150   // Restore original seed and num_parallel_workers
2151   status_set_seed = config::set_seed(original_seed);
2152   EXPECT_EQ(status_set_seed, true);
2153   status_set_worker = config::set_num_parallel_workers(original_worker);
2154   EXPECT_EQ(status_set_worker, true);
2155 }
2156 
2157 /// Feature: TruncateSequencePair op
2158 /// Description: Test TruncateSequencePair with negative max_length
2159 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestTruncateSequencePairFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairFail.";
  // A negative max_length is invalid; the pipeline must fail to build an iterator.

  // Build a RandomDataset with two int8 columns ("col1", "col2") of shape {3}.
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt8, {3}));
  ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt8, {3}));
  std::shared_ptr<Dataset> dataset = RandomData(3, schema);
  EXPECT_NE(dataset, nullptr);

  // TruncateSequencePair op constructed with an invalid (negative) max_length.
  std::shared_ptr<TensorTransform> truncate_op = std::make_shared<text::TruncateSequencePair>(-1);
  EXPECT_NE(truncate_op, nullptr);

  // Attach the op; parameter validation is deferred to iterator creation.
  dataset = dataset->Map({truncate_op});
  EXPECT_NE(dataset, nullptr);

  // Expect failure: CreateIterator() returns nullptr for the invalid pipeline.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_EQ(itr, nullptr);
}
2183 
2184 /// Feature: Ngram op
2185 /// Description: Test parameters for Ngram interface
2186 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestNgramSuccess) {
  // Testing the parameter of Ngram interface.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess.";

  // Create a TextFile dataset (read in file order, no shuffle).
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create white_tokenizer operation on ds to split each line into words.
  std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  EXPECT_NE(white_tokenizer, nullptr);
  // Create ngram operation on ds: bigrams with "_" padding of width 1 on both
  // sides, joined by a single space.
  // Fix: pass std::vector<int32_t> explicitly to match the text::Ngram
  // interface and the sibling Ngram tests (previously std::vector<int>).
  auto ngram_op = std::make_shared<text::Ngram>(
    std::vector<int32_t>{2}, std::pair<std::string, int32_t>{"_", 1}, std::pair<std::string, int32_t>{"_", 1}, " ");
  EXPECT_NE(ngram_op, nullptr);

  // Create Map operation on ds applying both ops to the "text" column.
  ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset.
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row.
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {{"_ This", "This is", "is a", "a text", "text file.", "file. _"},
                                                    {"_ Be", "Be happy", "happy every", "every day.", "day. _"},
                                                    {"_ Good", "Good luck", "luck to", "to everyone.", "everyone. _"}};

  // Compare every produced row against the expected 1-D string tensor.
  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];

    std::shared_ptr<Tensor> de_expected_tensor;
    // Explicit cast avoids the implicit size_t -> int narrowing.
    auto x = static_cast<int64_t>(expected[i].size());
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Three lines in the input file -> three rows out.
  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline
  iter->Stop();
}
2241 
2242 /// Feature: Ngram op
2243 /// Description: Test Ngram basic usage
2244 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestNgramSuccess1)2245 TEST_F(MindDataTestPipeline, TestNgramSuccess1) {
2246   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess1.";
2247 
2248   // Create a TextFile dataset
2249   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
2250   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2251   EXPECT_NE(ds, nullptr);
2252 
2253   // Create white_tokenizer operation on ds
2254   std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
2255   EXPECT_NE(white_tokenizer, nullptr);
2256   // Create sliding_window operation on ds
2257   auto ngram_op = std::make_shared<text::Ngram>(
2258     std::vector<int32_t>{2, 3}, std::pair<std::string, int32_t>{"&", 2}, std::pair<std::string, int32_t>{"&", 2}, "-");
2259   EXPECT_NE(ngram_op, nullptr);
2260 
2261   // Create Map operation on ds
2262   ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
2263   EXPECT_NE(ds, nullptr);
2264 
2265   // Create an iterator over the result of the above dataset
2266   // This will trigger the creation of the Execution Tree and launch it.
2267   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2268   EXPECT_NE(iter, nullptr);
2269 
2270   // Iterate the dataset and get each row
2271   std::unordered_map<std::string, mindspore::MSTensor> row;
2272   ASSERT_OK(iter->GetNextRow(&row));
2273 
2274   std::vector<std::vector<std::string>> expected = {
2275     {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a", "is-a-text",
2276      "a-text-file.", "text-file.-&", "file.-&-&"},
2277     {"&-Be", "Be-happy", "happy-every", "every-day.", "day.-&", "&-&-Be", "&-Be-happy", "Be-happy-every",
2278      "happy-every-day.", "every-day.-&", "day.-&-&"},
2279     {"&-Good", "Good-luck", "luck-to", "to-everyone.", "everyone.-&", "&-&-Good", "&-Good-luck", "Good-luck-to",
2280      "luck-to-everyone.", "to-everyone.-&", "everyone.-&-&"}};
2281 
2282   uint64_t i = 0;
2283   while (row.size() != 0) {
2284     auto ind = row["text"];
2285 
2286     std::shared_ptr<Tensor> de_expected_tensor;
2287     int x = expected[i].size();
2288     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
2289     mindspore::MSTensor expected_tensor =
2290       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
2291     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
2292 
2293     ASSERT_OK(iter->GetNextRow(&row));
2294     i++;
2295   }
2296 
2297   EXPECT_EQ(i, 3);
2298 
2299   // Manually terminate the pipeline
2300   iter->Stop();
2301 }
2302 
2303 /// Feature: Ngram op
2304 /// Description: Test Ngram where the vector of ngram is empty
2305 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestNgramFail1) {
  // Negative test: Ngram must reject an empty ngrams vector.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail1.";

  // Source dataset for the pipeline under test.
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Ngram op constructed with an empty ngrams vector (invalid).
  auto ngram_op = std::make_shared<text::Ngram>(std::vector<int32_t>{});
  EXPECT_NE(ngram_op, nullptr);

  // Attach the op; parameter validation is deferred to iterator creation.
  dataset = dataset->Map({ngram_op});
  EXPECT_NE(dataset, nullptr);

  // Expect failure: CreateIterator() returns nullptr for the invalid pipeline.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_EQ(itr, nullptr);
}
2328 
2329 /// Feature: Ngram op
2330 /// Description: Test Ngram where value of ngram vector is equal to 0
2331 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestNgramFail2) {
  // Negative test: Ngram must reject an n-gram size of 0.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail2.";

  // Source dataset for the pipeline under test.
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Ngram op constructed with a zero n-gram size (must be > 0).
  auto ngram_op = std::make_shared<text::Ngram>(std::vector<int32_t>{0});
  EXPECT_NE(ngram_op, nullptr);

  // Attach the op; parameter validation is deferred to iterator creation.
  dataset = dataset->Map({ngram_op});
  EXPECT_NE(dataset, nullptr);

  // Expect failure: CreateIterator() returns nullptr for the invalid pipeline.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_EQ(itr, nullptr);
}
2354 
2355 /// Feature: Ngram op
2356 /// Description: Test Ngram where value of ngram vector is less than 0
2357 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestNgramFail3) {
  // Negative test: Ngram must reject a negative n-gram size.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail3.";

  // Source dataset for the pipeline under test.
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Ngram op constructed with a negative n-gram size (must be > 0).
  auto ngram_op = std::make_shared<text::Ngram>(std::vector<int32_t>{-2});
  EXPECT_NE(ngram_op, nullptr);

  // Attach the op; parameter validation is deferred to iterator creation.
  dataset = dataset->Map({ngram_op});
  EXPECT_NE(dataset, nullptr);

  // Expect failure: CreateIterator() returns nullptr for the invalid pipeline.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_EQ(itr, nullptr);
}
2380 
2381 /// Feature: Ngram op
2382 /// Description: Test Ngram where second parameter pad_width in left_pad vector is less than 0
2383 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestNgramFail4) {
  // Negative test: Ngram must reject a negative left_pad width.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail4.";

  // Source dataset for the pipeline under test.
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Ngram op whose left_pad pad_width is negative (invalid).
  auto ngram_op = std::make_shared<text::Ngram>(std::vector<int32_t>{2}, std::pair<std::string, int32_t>{"", -1});
  EXPECT_NE(ngram_op, nullptr);

  // Attach the op; parameter validation is deferred to iterator creation.
  dataset = dataset->Map({ngram_op});
  EXPECT_NE(dataset, nullptr);

  // Expect failure: CreateIterator() returns nullptr for the invalid pipeline.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_EQ(itr, nullptr);
}
2406 
2407 /// Feature: Ngram op
2408 /// Description: Test Ngram where second parameter pad_width in right_pad vector is less than 0
2409 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestNgramFail5) {
  // Negative test: Ngram must reject a negative right_pad width.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail5.";

  // Source dataset for the pipeline under test.
  std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Ngram op with a valid left_pad but a negative right_pad pad_width (invalid).
  auto ngram_op = std::make_shared<text::Ngram>(
    std::vector<int32_t>{2}, std::pair<std::string, int32_t>{"", 1}, std::pair<std::string, int32_t>{"", -1});
  EXPECT_NE(ngram_op, nullptr);

  // Attach the op; parameter validation is deferred to iterator creation.
  dataset = dataset->Map({ngram_op});
  EXPECT_NE(dataset, nullptr);

  // Expect failure: CreateIterator() returns nullptr for the invalid pipeline.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_EQ(itr, nullptr);
}
2433 
2434 /// Feature: NormalizeUTF8 op
2435 /// Description: Test NormalizeUTF8 when the normalize_form is NormalizeForm::kNfkc
2436 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success) {
  // Exercise NormalizeUTF8 with NormalizeForm::kNfkc.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success.";

  // Source dataset: one text column, read in order (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // NormalizeUTF8 op under test, configured for NFKC.
  std::shared_ptr<TensorTransform> normalize_op = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfkc);
  EXPECT_NE(normalize_op, nullptr);

  // Apply the op to the "text" column.
  dataset = dataset->Map({normalize_op}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_NE(itr, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(itr->GetNextRow(&row));

  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};

  // Compare each produced scalar string against its expected normalized form.
  uint64_t row_idx = 0;
  for (; !row.empty(); ++row_idx) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> de_expected;
    ASSERT_OK(Tensor::CreateScalar(expected[row_idx], &de_expected));
    auto ms_expected = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
    EXPECT_MSTENSOR_EQ(text_col, ms_expected);
    ASSERT_OK(itr->GetNextRow(&row));
  }

  // Six input lines -> six output rows.
  EXPECT_EQ(row_idx, 6);

  // Manually terminate the pipeline
  itr->Stop();
}
2482 
2483 /// Feature: NormalizeUTF8 op
2484 /// Description: Test NormalizeUTF8 when the normalize_form is NormalizeForm::kNfc
2485 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success1) {
  // Exercise NormalizeUTF8 with NormalizeForm::kNfc.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success1.";

  // Source dataset: one text column, read in order (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // NormalizeUTF8 op under test, configured for NFC.
  std::shared_ptr<TensorTransform> normalize_op = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfc);
  EXPECT_NE(normalize_op, nullptr);

  // Apply the op to the "text" column.
  dataset = dataset->Map({normalize_op}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_NE(itr, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(itr->GetNextRow(&row));

  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};

  // Compare each produced scalar string against its expected normalized form.
  uint64_t row_idx = 0;
  for (; !row.empty(); ++row_idx) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> de_expected;
    ASSERT_OK(Tensor::CreateScalar(expected[row_idx], &de_expected));
    auto ms_expected = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
    EXPECT_MSTENSOR_EQ(text_col, ms_expected);
    ASSERT_OK(itr->GetNextRow(&row));
  }

  // Six input lines -> six output rows.
  EXPECT_EQ(row_idx, 6);

  // Manually terminate the pipeline
  itr->Stop();
}
2531 
2532 /// Feature: NormalizeUTF8 op
2533 /// Description: Test NormalizeUTF8 when the normalize_form is NormalizeForm::kNfd
2534 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success2) {
  // Exercise NormalizeUTF8 with NormalizeForm::kNfd.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success2.";

  // Source dataset: one text column, read in order (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // NormalizeUTF8 op under test, configured for NFD.
  std::shared_ptr<TensorTransform> normalize_op = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfd);
  EXPECT_NE(normalize_op, nullptr);

  // Apply the op to the "text" column.
  dataset = dataset->Map({normalize_op}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_NE(itr, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(itr->GetNextRow(&row));

  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};

  // Compare each produced scalar string against its expected normalized form.
  uint64_t row_idx = 0;
  for (; !row.empty(); ++row_idx) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> de_expected;
    ASSERT_OK(Tensor::CreateScalar(expected[row_idx], &de_expected));
    auto ms_expected = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
    EXPECT_MSTENSOR_EQ(text_col, ms_expected);
    ASSERT_OK(itr->GetNextRow(&row));
  }

  // Six input lines -> six output rows.
  EXPECT_EQ(row_idx, 6);

  // Manually terminate the pipeline
  itr->Stop();
}
2580 
2581 /// Feature: NormalizeUTF8 op
2582 /// Description: Test NormalizeUTF8 when the normalize_form is NormalizeForm::kNfkd
2583 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success3) {
  // Exercise NormalizeUTF8 with NormalizeForm::kNfkd.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success3.";

  // Source dataset: one text column, read in order (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // NormalizeUTF8 op under test, configured for NFKD.
  std::shared_ptr<TensorTransform> normalize_op = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfkd);
  EXPECT_NE(normalize_op, nullptr);

  // Apply the op to the "text" column.
  dataset = dataset->Map({normalize_op}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_NE(itr, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(itr->GetNextRow(&row));

  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};

  // Compare each produced scalar string against its expected normalized form.
  uint64_t row_idx = 0;
  for (; !row.empty(); ++row_idx) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> de_expected;
    ASSERT_OK(Tensor::CreateScalar(expected[row_idx], &de_expected));
    auto ms_expected = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
    EXPECT_MSTENSOR_EQ(text_col, ms_expected);
    ASSERT_OK(itr->GetNextRow(&row));
  }

  // Six input lines -> six output rows.
  EXPECT_EQ(row_idx, 6);

  // Manually terminate the pipeline
  itr->Stop();
}
2629 
2630 /// Feature: RegexReplace op
2631 /// Description: Test RegexReplace when the replace_all=true
2632 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess) {
  // Exercise RegexReplace with replace_all = true: every whitespace run becomes "_".
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess.";

  // Source dataset: one text column, read in order (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // RegexReplace op: pattern "\s+" -> "_", replacing all matches.
  std::shared_ptr<TensorTransform> regex_replace_op = std::make_shared<text::RegexReplace>("\\s+", "_", true);
  EXPECT_NE(regex_replace_op, nullptr);

  // Apply the op to the "text" column.
  dataset = dataset->Map({regex_replace_op}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_NE(itr, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(itr->GetNextRow(&row));

  std::vector<std::string> expected = {"Hello_World", "Let's_Go",          "1:hello",        "2:world",
                                       "31:beijing",  "Welcome_to_China!", "_我_不想_长大_", "Welcome_to_Shenzhen!"};

  // Compare each produced scalar string against its expected replacement.
  uint64_t row_idx = 0;
  for (; !row.empty(); ++row_idx) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> de_expected;
    ASSERT_OK(Tensor::CreateScalar(expected[row_idx], &de_expected));
    auto ms_expected = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
    EXPECT_MSTENSOR_EQ(text_col, ms_expected);
    ASSERT_OK(itr->GetNextRow(&row));
  }

  // Eight input lines -> eight output rows.
  EXPECT_EQ(row_idx, 8);

  // Manually terminate the pipeline
  itr->Stop();
}
2679 
2680 /// Feature: RegexReplace op
2681 /// Description: Test RegexReplace when the replace_all=false
2682 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess1) {
  // Exercise RegexReplace with replace_all = false: only the first whitespace
  // run per line becomes "_".
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess1.";

  // Source dataset: one text column, read in order (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // RegexReplace op: pattern "\s+" -> "_", first match only.
  std::shared_ptr<TensorTransform> regex_replace_op = std::make_shared<text::RegexReplace>("\\s+", "_", false);
  EXPECT_NE(regex_replace_op, nullptr);

  // Apply the op to the "text" column.
  dataset = dataset->Map({regex_replace_op}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_NE(itr, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(itr->GetNextRow(&row));

  // NOTE: later entries intentionally contain literal tab and double-space
  // characters left untouched after the first replacement.
  std::vector<std::string> expected = {"Hello_World", "Let's_Go",          "1:hello",          "2:world",
                                       "31:beijing",  "Welcome_to China!", "_我	不想  长大	", "Welcome_to Shenzhen!"};

  // Compare each produced scalar string against its expected replacement.
  uint64_t row_idx = 0;
  for (; !row.empty(); ++row_idx) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> de_expected;
    ASSERT_OK(Tensor::CreateScalar(expected[row_idx], &de_expected));
    auto ms_expected = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
    EXPECT_MSTENSOR_EQ(text_col, ms_expected);
    ASSERT_OK(itr->GetNextRow(&row));
  }

  // Eight input lines -> eight output rows.
  EXPECT_EQ(row_idx, 8);

  // Manually terminate the pipeline
  itr->Stop();
}
2729 
2730 /// Feature: RegexTokenizer op
2731 /// Description: Test RegexTokenizer when with_offsets=false
2732 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess) {
  // Exercise RegexTokenizer with with_offsets = false: delimiter pattern "\s+",
  // keeping the matched delimiters ("\s+" keep pattern) as tokens.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess.";

  // Source dataset: one text column, read in order (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // RegexTokenizer op under test.
  std::shared_ptr<TensorTransform> regex_tokenizer_op = std::make_shared<text::RegexTokenizer>("\\s+", "\\s+", false);
  EXPECT_NE(regex_tokenizer_op, nullptr);

  // Apply the op to the "text" column.
  dataset = dataset->Map({regex_tokenizer_op}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> itr = dataset->CreateIterator();
  EXPECT_NE(itr, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(itr->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {{"Hello", " ", "World"},
                                                    {"Let's", " ", "Go"},
                                                    {"1:hello"},
                                                    {"2:world"},
                                                    {"31:beijing"},
                                                    {"Welcome", " ", "to", " ", "China!"},
                                                    {"  ", "我", "	", "不想", "  ", "长大", "	"},
                                                    {"Welcome", " ", "to", " ", "Shenzhen!"}};

  // Compare each produced token tensor against the expected 1-D string tensor.
  uint64_t row_idx = 0;
  for (; !row.empty(); ++row_idx) {
    auto text_col = row["text"];

    std::shared_ptr<Tensor> de_expected;
    int token_count = expected[row_idx].size();
    ASSERT_OK(Tensor::CreateFromVector(expected[row_idx], TensorShape({token_count}), &de_expected));
    auto ms_expected = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
    EXPECT_MSTENSOR_EQ(text_col, ms_expected);

    ASSERT_OK(itr->GetNextRow(&row));
  }

  // Eight input lines -> eight output rows.
  EXPECT_EQ(row_idx, 8);

  // Manually terminate the pipeline
  itr->Stop();
}
2788 
2789 /// Feature: RegexTokenizer op
2790 /// Description: Test RegexTokenizer when with_offsets=true
2791 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestRegexTokenizerSuccess1)2792 TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess1) {
2793   // Testing the parameter of RegexTokenizer interface when the with_offsets is true.
2794   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess1.";
2795 
2796   // Create a TextFile dataset
2797   std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
2798   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2799   EXPECT_NE(ds, nullptr);
2800 
2801   // Create regex_tokenizer operation on ds
2802   std::shared_ptr<TensorTransform> regex_tokenizer = std::make_shared<text::RegexTokenizer>("\\s+", "\\s+", true);
2803   EXPECT_NE(regex_tokenizer, nullptr);
2804 
2805   // Create Map operation on ds
2806   ds = ds->Map({regex_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
2807   EXPECT_NE(ds, nullptr);
2808 
2809   // Create an iterator over the result of the above dataset
2810   // This will trigger the creation of the Execution Tree and launch it.
2811   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2812   EXPECT_NE(iter, nullptr);
2813 
2814   // Iterate the dataset and get each row
2815   std::unordered_map<std::string, mindspore::MSTensor> row;
2816   ASSERT_OK(iter->GetNextRow(&row));
2817 
2818   std::vector<std::vector<std::string>> expected_tokens = {{"Hello", " ", "World"},
2819                                                            {"Let's", " ", "Go"},
2820                                                            {"1:hello"},
2821                                                            {"2:world"},
2822                                                            {"31:beijing"},
2823                                                            {"Welcome", " ", "to", " ", "China!"},
2824                                                            {"  ", "我", "	", "不想", "  ", "长大", "	"},
2825                                                            {"Welcome", " ", "to", " ", "Shenzhen!"}};
2826 
2827   std::vector<std::vector<uint32_t>> expected_offsets_start = {
2828     {0, 5, 6}, {0, 5, 6}, {0}, {0}, {0}, {0, 7, 8, 10, 11}, {0, 2, 5, 6, 12, 14, 20}, {0, 7, 8, 10, 11}};
2829   std::vector<std::vector<uint32_t>> expected_offsets_limit = {
2830     {5, 6, 11}, {5, 6, 8}, {7}, {7}, {10}, {7, 8, 10, 11, 17}, {2, 5, 6, 12, 14, 20, 21}, {7, 8, 10, 11, 20}};
2831 
2832   uint64_t i = 0;
2833   while (row.size() != 0) {
2834     auto token = row["token"];
2835     auto start = row["offsets_start"];
2836     auto limit = row["offsets_limit"];
2837 
2838     std::shared_ptr<Tensor> de_expected_tokens;
2839     int x = expected_tokens[i].size();
2840     ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
2841     mindspore::MSTensor ms_expected_tokens =
2842       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
2843     EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
2844 
2845     std::shared_ptr<Tensor> de_expected_offsets_start;
2846     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
2847     mindspore::MSTensor ms_expected_offsets_start =
2848       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
2849     EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
2850 
2851     std::shared_ptr<Tensor> de_expected_offsets_limit;
2852     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
2853     mindspore::MSTensor ms_expected_offsets_limit =
2854       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
2855     EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
2856 
2857     ASSERT_OK(iter->GetNextRow(&row));
2858     i++;
2859   }
2860 
2861   EXPECT_EQ(i, 8);
2862 
2863   // Manually terminate the pipeline
2864   iter->Stop();
2865 }
2866 
2867 /// Feature: UnicodeCharTokenizer op
2868 /// Description: Test UnicodeCharTokenizer when with_offsets is default
2869 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess) {
  // Testing the parameter of UnicodeCharTokenizer interface when the with_offsets is default.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess.";

  // Build a TextFile dataset over the tokenizer test data (no shuffle, read all samples).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Tokenize the "text" column into individual unicode characters (with_offsets defaults to false).
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::UnicodeCharTokenizer>();
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iter = dataset->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Expected per-row character tokens.
  std::vector<std::vector<std::string>> expected = {
    {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
    {"北", "京", "欢", "迎", "您", "!"},
    {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
    {" ", " "}};

  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_col = row["text"];

    // Materialize the expected tokens as a 1-D tensor and compare against the pipeline output.
    std::shared_ptr<Tensor> expected_de_tensor;
    int num_tokens = expected[row_count].size();
    ASSERT_OK(Tensor::CreateFromVector(expected[row_count], TensorShape({num_tokens}), &expected_de_tensor));
    mindspore::MSTensor expected_ms_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(text_col, expected_ms_tensor);

    ASSERT_OK(iter->GetNextRow(&row));
    ++row_count;
  }

  // The test file contains exactly 4 lines.
  EXPECT_EQ(row_count, 4);

  // Manually terminate the pipeline
  iter->Stop();
}
2922 
2923 /// Feature: UnicodeCharTokenizer op
2924 /// Description: Test UnicodeCharTokenizer when with_offsets=true
2925 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestUnicodeCharTokenizerSuccess1)2926 TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess1) {
2927   // Testing the parameter of UnicodeCharTokenizer interface when the with_offsets is true.
2928   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess1.";
2929 
2930   // Create a TextFile dataset
2931   std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
2932   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2933   EXPECT_NE(ds, nullptr);
2934 
2935   // Create unicodechar_tokenizer operation on ds
2936   std::shared_ptr<TensorTransform> unicodechar_tokenizer = std::make_shared<text::UnicodeCharTokenizer>(true);
2937   EXPECT_NE(unicodechar_tokenizer, nullptr);
2938 
2939   // Create Map operation on ds
2940   ds = ds->Map({unicodechar_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
2941   EXPECT_NE(ds, nullptr);
2942 
2943   // Create an iterator over the result of the above dataset
2944   // This will trigger the creation of the Execution Tree and launch it.
2945   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2946   EXPECT_NE(iter, nullptr);
2947 
2948   // Iterate the dataset and get each row
2949   std::unordered_map<std::string, mindspore::MSTensor> row;
2950   ASSERT_OK(iter->GetNextRow(&row));
2951 
2952   std::vector<std::vector<std::string>> expected_tokens = {
2953     {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
2954     {"北", "京", "欢", "迎", "您", "!"},
2955     {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
2956     {" ", " "}};
2957 
2958   std::vector<std::vector<uint32_t>> expected_offsets_start = {
2959     {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18},
2960     {0, 3, 6, 9, 12, 15},
2961     {0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16},
2962     {0, 1}};
2963 
2964   std::vector<std::vector<uint32_t>> expected_offsets_limit = {
2965     {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
2966     {3, 6, 9, 12, 15, 18},
2967     {3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17},
2968     {1, 2}};
2969 
2970   uint64_t i = 0;
2971   while (row.size() != 0) {
2972     auto token = row["token"];
2973     auto start = row["offsets_start"];
2974     auto limit = row["offsets_limit"];
2975 
2976     std::shared_ptr<Tensor> de_expected_tokens;
2977     int x = expected_tokens[i].size();
2978     ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
2979     mindspore::MSTensor ms_expected_tokens =
2980       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
2981     EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
2982 
2983     std::shared_ptr<Tensor> de_expected_offsets_start;
2984     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
2985     mindspore::MSTensor ms_expected_offsets_start =
2986       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
2987     EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
2988 
2989     std::shared_ptr<Tensor> de_expected_offsets_limit;
2990     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
2991     mindspore::MSTensor ms_expected_offsets_limit =
2992       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
2993     EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
2994 
2995     ASSERT_OK(iter->GetNextRow(&row));
2996     i++;
2997   }
2998 
2999   EXPECT_EQ(i, 4);
3000 
3001   // Manually terminate the pipeline
3002   iter->Stop();
3003 }
3004 
// Shared English vocabulary for the WordpieceTokenizer tests below;
// entries prefixed with "##" are suffix subword pieces.
std::vector<std::string> vocab_english = {"book", "cholera", "era", "favor", "##ite", "my",
                                          "is",   "love",    "dur", "##ing", "the"};

// Shared Chinese (single-character) vocabulary for the WordpieceTokenizer tests below.
std::vector<std::string> vocab_chinese = {"我", "最", "喜", "欢", "的", "书", "是", "霍", "乱", "时", "期", "爱", "情"};
3009 
3010 /// Feature: WordpieceTokenizer op
3011 /// Description: Test WordpieceTokenizer with default parameters on English vocab
3012 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess1.";
  // Test WordpieceTokenizer with default parameters on English vocab

  // Source: first 10 lines of the wordpiece tokenizer test data.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(10);
  EXPECT_NE(dataset, nullptr);

  // Vocab built from the shared English word list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // Tokenize the "text" column with all-default WordpieceTokenizer options.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iter = dataset->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Out-of-vocab words map to the default unknown token "[UNK]".
  std::vector<std::vector<std::string>> expected = {
    {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};

  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> expected_de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[row_count], &expected_de_tensor));
    mindspore::MSTensor expected_ms_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(text_col, expected_ms_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    ++row_count;
  }

  EXPECT_EQ(row_count, 10);

  // Manually terminate the pipeline
  iter->Stop();
}
3068 
3069 /// Feature: WordpieceTokenizer op
3070 /// Description: Test WordpieceTokenizer with empty unknown_token
3071 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess2.";
  // Test WordpieceTokenizer with empty unknown_token

  // Source: first 10 lines of the wordpiece tokenizer test data.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(10);
  EXPECT_NE(dataset, nullptr);

  // Vocab built from the shared English word list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // With an empty unknown_token, out-of-vocab words pass through unchanged.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "", false);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iter = dataset->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Note the last row: "what" is out-of-vocab but kept as-is (no "[UNK]" substitution).
  std::vector<std::vector<std::string>> expected = {
    {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"what"}};

  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> expected_de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[row_count], &expected_de_tensor));
    mindspore::MSTensor expected_ms_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(text_col, expected_ms_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    ++row_count;
  }

  EXPECT_EQ(row_count, 10);

  // Manually terminate the pipeline
  iter->Stop();
}
3128 
3129 /// Feature: WordpieceTokenizer op
3130 /// Description: Test WordpieceTokenizer with non-default max_bytes_per_token
3131 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess3.";
  // Test WordpieceTokenizer with non-default max_bytes_per_token

  // Source: first 10 lines of the wordpiece tokenizer test data.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(10);
  EXPECT_NE(dataset, nullptr);

  // Vocab built from the shared English word list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // max_bytes_per_token = 4: any input word longer than 4 bytes becomes "[UNK]".
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 4, "[UNK]", false);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iter = dataset->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {{"my"},    {"[UNK]"}, {"book"},  {"is"},  {"love"},
                                                    {"[UNK]"}, {"the"},   {"[UNK]"}, {"era"}, {"[UNK]"}};

  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> expected_de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[row_count], &expected_de_tensor));
    mindspore::MSTensor expected_ms_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(text_col, expected_ms_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    ++row_count;
  }

  EXPECT_EQ(row_count, 10);

  // Manually terminate the pipeline
  iter->Stop();
}
3188 
3189 /// Feature: WordpieceTokenizer op
3190 /// Description: Test WordpieceTokenizer with default parameters on Chinese vocab
3191 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestWordpieceTokenizerSuccess4)3192 TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess4) {
3193   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess4.";
3194   // Test WordpieceTokenizer with default parameters on Chinese vocab
3195 
3196   // Create a TextFile dataset
3197   std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
3198   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3199   EXPECT_NE(ds, nullptr);
3200 
3201   // Create Skip operation on ds
3202   ds = ds->Skip(10);
3203   EXPECT_NE(ds, nullptr);
3204 
3205   // Create Take operation on ds
3206   ds = ds->Take(15);
3207   EXPECT_NE(ds, nullptr);
3208 
3209   // Create a vocab from vector
3210   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
3211   Status s = Vocab::BuildFromVector(vocab_chinese, {}, true, &vocab);
3212   EXPECT_EQ(s, Status::OK());
3213 
3214   // Create WordpieceTokenizer operation on ds
3215   std::shared_ptr<TensorTransform> wordpiece_tokenizer =
3216     std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", false);
3217   EXPECT_NE(wordpiece_tokenizer, nullptr);
3218 
3219   // Create Map operation on ds
3220   ds = ds->Map({wordpiece_tokenizer}, {"text"});
3221   EXPECT_NE(ds, nullptr);
3222 
3223   // Create an iterator over the result of the above dataset
3224   // This will trigger the creation of the Execution Tree and launch it.
3225   std::shared_ptr<Iterator> iter = ds->CreateIterator();
3226   EXPECT_NE(iter, nullptr);
3227 
3228   // Iterate the dataset and get each row
3229   std::unordered_map<std::string, mindspore::MSTensor> row;
3230   ASSERT_OK(iter->GetNextRow(&row));
3231 
3232   std::vector<std::vector<std::string>> expected = {{"我"}, {"最"}, {"喜"}, {"欢"}, {"的"}, {"书"}, {"是"},   {"霍"},
3233                                                     {"乱"}, {"时"}, {"期"}, {"的"}, {"爱"}, {"情"}, {"[UNK]"}};
3234 
3235   uint64_t i = 0;
3236   while (row.size() != 0) {
3237     auto txt = row["text"];
3238     std::shared_ptr<Tensor> de_expected_tensor;
3239     ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
3240     mindspore::MSTensor expected_tensor =
3241       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
3242     EXPECT_MSTENSOR_EQ(txt, expected_tensor);
3243     ASSERT_OK(iter->GetNextRow(&row));
3244     i++;
3245   }
3246 
3247   EXPECT_EQ(i, 15);
3248 
3249   // Manually terminate the pipeline
3250   iter->Stop();
3251 }
3252 
3253 /// Feature: WordpieceTokenizer op
3254 /// Description: Test WordpieceTokenizer with with_offsets=true
3255 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestWordpieceTokenizerSuccess5)3256 TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess5) {
3257   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess5.";
3258   // Test WordpieceTokenizer with with_offsets true
3259 
3260   // Create a TextFile dataset
3261   std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
3262   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3263   EXPECT_NE(ds, nullptr);
3264 
3265   // Create Take operation on ds
3266   ds = ds->Take(10);
3267   EXPECT_NE(ds, nullptr);
3268 
3269   // Create a vocab from vector
3270   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
3271   Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
3272   EXPECT_EQ(s, Status::OK());
3273 
3274   // Create WordpieceTokenizer operation on ds
3275   std::shared_ptr<TensorTransform> wordpiece_tokenizer =
3276     std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", true);
3277   EXPECT_NE(wordpiece_tokenizer, nullptr);
3278 
3279   // Create Map operation on ds
3280   ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
3281   EXPECT_NE(ds, nullptr);
3282 
3283   // Create an iterator over the result of the above dataset
3284   // This will trigger the creation of the Execution Tree and launch it.
3285   std::shared_ptr<Iterator> iter = ds->CreateIterator();
3286   EXPECT_NE(iter, nullptr);
3287 
3288   // Iterate the dataset and get each row
3289   std::unordered_map<std::string, mindspore::MSTensor> row;
3290   ASSERT_OK(iter->GetNextRow(&row));
3291 
3292   std::vector<std::vector<std::string>> expected = {
3293     {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};
3294   std::vector<std::vector<uint32_t>> expected_offsets_start = {{0}, {0, 5}, {0}, {0}, {0}, {0, 3}, {0}, {0}, {0}, {0}};
3295   std::vector<std::vector<uint32_t>> expected_offsets_limit = {{2}, {5, 8}, {4}, {2}, {4}, {3, 6}, {3}, {7}, {3}, {4}};
3296 
3297   uint64_t i = 0;
3298   while (row.size() != 0) {
3299     auto txt = row["token"];
3300     std::shared_ptr<Tensor> de_expected_tensor;
3301     ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
3302     mindspore::MSTensor expected_tensor =
3303       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
3304     EXPECT_MSTENSOR_EQ(txt, expected_tensor);
3305 
3306     auto start = row["offsets_start"];
3307     std::shared_ptr<Tensor> de_expected_start_tensor;
3308     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], &de_expected_start_tensor));
3309     mindspore::MSTensor expected_start_tensor =
3310       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_start_tensor));
3311     EXPECT_MSTENSOR_EQ(start, expected_start_tensor);
3312 
3313     auto limit = row["offsets_limit"];
3314     std::shared_ptr<Tensor> de_expected_limit_tensor;
3315     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], &de_expected_limit_tensor));
3316     mindspore::MSTensor expected_limit_tensor =
3317       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_limit_tensor));
3318     EXPECT_MSTENSOR_EQ(limit, expected_limit_tensor);
3319     ASSERT_OK(iter->GetNextRow(&row));
3320     i++;
3321   }
3322 
3323   EXPECT_EQ(i, 10);
3324 
3325   // Manually terminate the pipeline
3326   iter->Stop();
3327 }
3328 
3329 /// Feature: WordpieceTokenizer op
3330 /// Description: Test WordpieceTokenizer with max_bytes_per_token=0
3331 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess6) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess6.";
  // Test WordpieceTokenizer with max_bytes_per_token equals to 0

  // Source: first 10 lines of the wordpiece tokenizer test data.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(10);
  EXPECT_NE(dataset, nullptr);

  // Vocab built from the shared English word list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // max_bytes_per_token = 0: every word exceeds the limit, so all become "[UNK]".
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 0, "[UNK]", true);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iter = dataset->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {{"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"},
                                                    {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}};

  uint64_t row_count = 0;
  while (!row.empty()) {
    // Only the token column is verified in this case.
    auto token_col = row["token"];
    std::shared_ptr<Tensor> expected_de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[row_count], &expected_de_tensor));
    mindspore::MSTensor expected_ms_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(token_col, expected_ms_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    ++row_count;
  }

  EXPECT_EQ(row_count, 10);

  // Manually terminate the pipeline
  iter->Stop();
}
3388 
3389 /// Feature: WordpieceTokenizer op
3390 /// Description: Test WordpieceTokenizer with nullptr vocab
3391 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail1.";
  // Test WordpieceTokenizer with nullptr vocab

  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Construction itself succeeds; the invalid (null) vocab is only detected
  // when the pipeline is validated at iterator-creation time.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::WordpieceTokenizer>(nullptr);
  EXPECT_NE(tokenizer, nullptr);

  dataset = dataset->Map({tokenizer});
  EXPECT_NE(dataset, nullptr);

  // Expect failure: invalid WordpieceTokenizer input with nullptr vocab
  std::shared_ptr<Iterator> iter = dataset->CreateIterator();
  EXPECT_EQ(iter, nullptr);
}
3413 
3414 /// Feature: WordpieceTokenizer op
3415 /// Description: Test WordpieceTokenizer with negative max_bytes_per_token
3416 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail2.";
  // Test WordpieceTokenizer with negative max_bytes_per_token

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create WordpieceTokenizer operation on ds with an invalid negative max_bytes_per_token.
  // Construction succeeds; validation happens when the pipeline is built.
  std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab, "##", -1);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({wordpiece_tokenizer});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid WordpieceTokenizer input with negative max_bytes_per_token
  EXPECT_EQ(iter, nullptr);
}
3443 
3444 /// Feature: UnicodeScriptTokenizer op
3445 /// Description: Test UnicodeScriptTokenizer when with_offsets and keep_whitespace is default
3446 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess) {
  // Testing the parameter of UnicodeScriptTokenizer interface when the with_offsets and the keep_whitespace is default.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess.";

  // Build a TextFile dataset over the tokenizer test data (no shuffle, read all samples).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Split the "text" column on unicode script boundaries; whitespace is dropped by default.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::UnicodeScriptTokenizer>();
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iter = dataset->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {
    {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};

  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_col = row["text"];

    // Materialize the expected tokens as a 1-D tensor and compare against the pipeline output.
    std::shared_ptr<Tensor> expected_de_tensor;
    int num_tokens = expected[row_count].size();
    ASSERT_OK(Tensor::CreateFromVector(expected[row_count], TensorShape({num_tokens}), &expected_de_tensor));
    mindspore::MSTensor expected_ms_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(text_col, expected_ms_tensor);

    ASSERT_OK(iter->GetNextRow(&row));
    ++row_count;
  }

  // The test file contains exactly 4 lines.
  EXPECT_EQ(row_count, 4);

  // Manually terminate the pipeline
  iter->Stop();
}
3496 
3497 /// Feature: UnicodeScriptTokenizer op
3498 /// Description: Test UnicodeScriptTokenizer when with_offsets=false and keep_whitespace=true
3499 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess1) {
  // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is true and the with_offsets is
  // false.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess1.";

  // Build a TextFile dataset over the tokenizer test data (no shuffle, read all samples).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // keep_whitespace = true: whitespace runs are emitted as their own tokens.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::UnicodeScriptTokenizer>(true);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iter = dataset->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {
    {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {"  "}};

  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_col = row["text"];

    // Materialize the expected tokens as a 1-D tensor and compare against the pipeline output.
    std::shared_ptr<Tensor> expected_de_tensor;
    int num_tokens = expected[row_count].size();
    ASSERT_OK(Tensor::CreateFromVector(expected[row_count], TensorShape({num_tokens}), &expected_de_tensor));
    mindspore::MSTensor expected_ms_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(text_col, expected_ms_tensor);

    ASSERT_OK(iter->GetNextRow(&row));
    ++row_count;
  }

  // The test file contains exactly 4 lines.
  EXPECT_EQ(row_count, 4);

  // Manually terminate the pipeline
  iter->Stop();
}
3550 
3551 /// Feature: UnicodeScriptTokenizer op
3552 /// Description: Test UnicodeScriptTokenizer when with_offsets=true and keep_whitespace=false
3553 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestUnicodeScriptTokenizerSuccess2)3554 TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess2) {
3555   // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is false and the with_offsets is
3556   // true.
3557   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess2.";
3558 
3559   // Create a TextFile dataset
3560   std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
3561   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3562   EXPECT_NE(ds, nullptr);
3563 
3564   // Create unicodescript_tokenizer operation on ds
3565   std::shared_ptr<TensorTransform> unicodescript_tokenizer =
3566     std::make_shared<text::UnicodeScriptTokenizer>(false, true);
3567   EXPECT_NE(unicodescript_tokenizer, nullptr);
3568 
3569   // Create Map operation on ds
3570   ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
3571   EXPECT_NE(ds, nullptr);
3572 
3573   // Create an iterator over the result of the above dataset
3574   // This will trigger the creation of the Execution Tree and launch it.
3575   std::shared_ptr<Iterator> iter = ds->CreateIterator();
3576   EXPECT_NE(iter, nullptr);
3577 
3578   // Iterate the dataset and get each row
3579   std::unordered_map<std::string, mindspore::MSTensor> row;
3580   ASSERT_OK(iter->GetNextRow(&row));
3581 
3582   std::vector<std::vector<std::string>> expected_tokens = {
3583     {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};
3584 
3585   std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
3586   std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 18, 19}, {15, 18}, {9, 16, 17}, {0}};
3587 
3588   uint64_t i = 0;
3589   while (row.size() != 0) {
3590     auto token = row["token"];
3591     auto start = row["offsets_start"];
3592     auto limit = row["offsets_limit"];
3593 
3594     std::shared_ptr<Tensor> de_expected_tokens;
3595     int x = expected_tokens[i].size();
3596     ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
3597     mindspore::MSTensor ms_expected_tokens =
3598       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
3599     EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
3600 
3601     std::shared_ptr<Tensor> de_expected_offsets_start;
3602     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
3603     mindspore::MSTensor ms_expected_offsets_start =
3604       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
3605     EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
3606 
3607     std::shared_ptr<Tensor> de_expected_offsets_limit;
3608     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
3609     mindspore::MSTensor ms_expected_offsets_limit =
3610       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
3611     EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
3612 
3613     ASSERT_OK(iter->GetNextRow(&row));
3614     i++;
3615   }
3616 
3617   EXPECT_EQ(i, 4);
3618 
3619   // Manually terminate the pipeline
3620   iter->Stop();
3621 }
3622 
3623 /// Feature: UnicodeScriptTokenizer op
3624 /// Description: Test UnicodeScriptTokenizer when with_offsets=true and keep_whitespace=true
3625 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestUnicodeScriptTokenizerSuccess3)3626 TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess3) {
3627   // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is true and the with_offsets is
3628   // true.
3629   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess3.";
3630 
3631   // Create a TextFile dataset
3632   std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
3633   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3634   EXPECT_NE(ds, nullptr);
3635 
3636   // Create unicodescript_tokenizer operation on ds
3637   std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>(true, true);
3638   EXPECT_NE(unicodescript_tokenizer, nullptr);
3639 
3640   // Create Map operation on ds
3641   ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
3642   EXPECT_NE(ds, nullptr);
3643 
3644   // Create an iterator over the result of the above dataset
3645   // This will trigger the creation of the Execution Tree and launch it.
3646   std::shared_ptr<Iterator> iter = ds->CreateIterator();
3647   EXPECT_NE(iter, nullptr);
3648 
3649   // Iterate the dataset and get each row
3650   std::unordered_map<std::string, mindspore::MSTensor> row;
3651   ASSERT_OK(iter->GetNextRow(&row));
3652 
3653   std::vector<std::vector<std::string>> expected_tokens = {
3654     {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {"  "}};
3655 
3656   std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 7, 8, 10, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
3657   std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 8, 10, 11, 18, 19}, {15, 18}, {9, 16, 17}, {2}};
3658 
3659   uint64_t i = 0;
3660   while (row.size() != 0) {
3661     auto token = row["token"];
3662     auto start = row["offsets_start"];
3663     auto limit = row["offsets_limit"];
3664 
3665     std::shared_ptr<Tensor> de_expected_tokens;
3666     int x = expected_tokens[i].size();
3667     ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
3668     mindspore::MSTensor ms_expected_tokens =
3669       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
3670     EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
3671 
3672     std::shared_ptr<Tensor> de_expected_offsets_start;
3673     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
3674     mindspore::MSTensor ms_expected_offsets_start =
3675       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
3676     EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
3677 
3678     std::shared_ptr<Tensor> de_expected_offsets_limit;
3679     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
3680     mindspore::MSTensor ms_expected_offsets_limit =
3681       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
3682     EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
3683 
3684     ASSERT_OK(iter->GetNextRow(&row));
3685     i++;
3686   }
3687 
3688   EXPECT_EQ(i, 4);
3689 
3690   // Manually terminate the pipeline
3691   iter->Stop();
3692 }
3693 
3694 /// Feature: WhitespaceTokenizer op
3695 /// Description: Test WhitespaceTokenizer when with_offsets is default
3696 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess) {
  // WhitespaceTokenizer with default arguments (with_offsets=false):
  // each input line is split on whitespace into a 1-D string tensor.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess.";

  // Source dataset: one text line per row, in file order.
  std::string file_path = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Tokenizer under test, default-constructed.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  EXPECT_NE(tokenizer, nullptr);

  // Tokenize the "text" column in place.
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Materialize the pipeline through an iterator.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> current_row;
  ASSERT_OK(iterator->GetNextRow(&current_row));

  std::vector<std::vector<std::string>> expected = {
    {"This", "is", "a", "text", "file."}, {"Be", "happy", "every", "day."}, {"Good", "luck", "to", "everyone."}};

  uint64_t row_count = 0;
  while (!current_row.empty()) {
    auto tokens_col = current_row["text"];

    std::shared_ptr<Tensor> token_tensor;
    int num_tokens = expected[row_count].size();
    ASSERT_OK(Tensor::CreateFromVector(expected[row_count], TensorShape({num_tokens}), &token_tensor));
    mindspore::MSTensor ms_token_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(token_tensor));
    EXPECT_MSTENSOR_EQ(tokens_col, ms_token_tensor);

    ASSERT_OK(iterator->GetNextRow(&current_row));
    row_count++;
  }

  // The fixture file has exactly 3 lines.
  EXPECT_EQ(row_count, 3);

  // Tear the pipeline down explicitly.
  iterator->Stop();
}
3746 
3747 /// Feature: WhitespaceTokenizer op
3748 /// Description: Test WhitespaceTokenizer when with_offsets=true
3749 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestWhitespaceTokenizerSuccess1)3750 TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess1) {
3751   // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is true.
3752   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess1.";
3753 
3754   // Create a TextFile dataset
3755   std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
3756   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3757   EXPECT_NE(ds, nullptr);
3758 
3759   // Create white_tokenizer operation on ds
3760   std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>(true);
3761   EXPECT_NE(white_tokenizer, nullptr);
3762 
3763   // Create Map operation on ds
3764   ds = ds->Map({white_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
3765   EXPECT_NE(ds, nullptr);
3766 
3767   // Create an iterator over the result of the above dataset
3768   // This will trigger the creation of the Execution Tree and launch it.
3769   std::shared_ptr<Iterator> iter = ds->CreateIterator();
3770   EXPECT_NE(iter, nullptr);
3771 
3772   // Iterate the dataset and get each row
3773   std::unordered_map<std::string, mindspore::MSTensor> row;
3774   ASSERT_OK(iter->GetNextRow(&row));
3775 
3776   std::vector<std::vector<std::string>> expected_tokens = {
3777     {"Welcome", "to", "Beijing!"}, {"北京欢迎您!"}, {"我喜欢English!"}, {""}};
3778 
3779   std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11}, {0}, {0}, {0}};
3780   std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 19}, {18}, {17}, {0}};
3781 
3782   uint64_t i = 0;
3783   while (row.size() != 0) {
3784     auto token = row["token"];
3785     auto start = row["offsets_start"];
3786     auto limit = row["offsets_limit"];
3787 
3788     std::shared_ptr<Tensor> de_expected_tokens;
3789     int x = expected_tokens[i].size();
3790     ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
3791     mindspore::MSTensor ms_expected_tokens =
3792       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
3793     EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
3794 
3795     std::shared_ptr<Tensor> de_expected_offsets_start;
3796     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
3797     mindspore::MSTensor ms_expected_offsets_start =
3798       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
3799     EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
3800 
3801     std::shared_ptr<Tensor> de_expected_offsets_limit;
3802     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
3803     mindspore::MSTensor ms_expected_offsets_limit =
3804       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
3805     EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
3806 
3807     ASSERT_OK(iter->GetNextRow(&row));
3808     i++;
3809   }
3810 
3811   EXPECT_EQ(i, 4);
3812 
3813   // Manually terminate the pipeline
3814   iter->Stop();
3815 }
3816 
3817 /// Feature: Vectors
3818 /// Description: Test with default parameter in function BuildFromFile and function Lookup
3819 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline,TestVectorsDefaultParam)3820 TEST_F(MindDataTestPipeline, TestVectorsDefaultParam) {
3821   // Test with default parameter.
3822   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDefaultParam.";
3823 
3824   // Create a TextFile dataset
3825   std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
3826   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3827   EXPECT_NE(ds, nullptr);
3828 
3829   std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
3830   std::shared_ptr<Vectors> vectors;
3831   Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
3832   EXPECT_EQ(s, Status::OK());
3833 
3834   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors);
3835   EXPECT_NE(lookup, nullptr);
3836 
3837   // Create Map operation on ds
3838   ds = ds->Map({lookup}, {"text"});
3839   EXPECT_NE(ds, nullptr);
3840 
3841   // Create an iterator over the result of the above dataset
3842   std::shared_ptr<Iterator> iter = ds->CreateIterator();
3843   EXPECT_NE(iter, nullptr);
3844 
3845   // Iterate the dataset and get each row
3846   std::unordered_map<std::string, mindspore::MSTensor> row;
3847   ASSERT_OK(iter->GetNextRow(&row));
3848 
3849   uint64_t i = 0;
3850   std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
3851                                               {0, 0, 0, 0, 0, 0},
3852                                               {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
3853                                               {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
3854                                               {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
3855                                               {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
3856                                               {0, 0, 0, 0, 0, 0}};
3857   while (row.size() != 0) {
3858     auto ind = row["text"];
3859     MS_LOG(INFO) << ind.Shape();
3860     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
3861     TensorPtr de_expected_item;
3862     dsize_t dim = 6;
3863     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
3864     mindspore::MSTensor ms_expected_item =
3865       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
3866     EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
3867 
3868     ASSERT_OK(iter->GetNextRow(&row));
3869     i++;
3870   }
3871 
3872   EXPECT_EQ(i, 7);
3873 
3874   // Manually terminate the pipeline
3875   iter->Stop();
3876 }
3877 
3878 /// Feature: Vectors
3879 /// Description: Test with all parameters which include `path` and `max_vector` in function BuildFromFile
3880 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline,TestVectorsAllBuildfromfileParams)3881 TEST_F(MindDataTestPipeline, TestVectorsAllBuildfromfileParams) {
3882   // Test with two parameters.
3883   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllBuildfromfileParams.";
3884 
3885   // Create a TextFile dataset
3886   std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
3887   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3888   EXPECT_NE(ds, nullptr);
3889 
3890   std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
3891   std::shared_ptr<Vectors> vectors;
3892   Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
3893   EXPECT_EQ(s, Status::OK());
3894 
3895   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors);
3896   EXPECT_NE(lookup, nullptr);
3897 
3898   // Create Map operation on ds
3899   ds = ds->Map({lookup}, {"text"});
3900   EXPECT_NE(ds, nullptr);
3901 
3902   // Create an iterator over the result of the above dataset
3903   std::shared_ptr<Iterator> iter = ds->CreateIterator();
3904   EXPECT_NE(iter, nullptr);
3905 
3906   // Iterate the dataset and get each row
3907   std::unordered_map<std::string, mindspore::MSTensor> row;
3908   ASSERT_OK(iter->GetNextRow(&row));
3909 
3910   uint64_t i = 0;
3911   std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
3912                                               {0, 0, 0, 0, 0, 0},
3913                                               {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
3914                                               {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
3915                                               {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
3916                                               {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
3917                                               {0, 0, 0, 0, 0, 0}};
3918   while (row.size() != 0) {
3919     auto ind = row["text"];
3920     MS_LOG(INFO) << ind.Shape();
3921     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
3922     TensorPtr de_expected_item;
3923     dsize_t dim = 6;
3924     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
3925     mindspore::MSTensor ms_expected_item =
3926       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
3927     EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
3928 
3929     ASSERT_OK(iter->GetNextRow(&row));
3930     i++;
3931   }
3932 
3933   EXPECT_EQ(i, 7);
3934 
3935   // Manually terminate the pipeline
3936   iter->Stop();
3937 }
3938 
3939 /// Feature: Vectors
3940 /// Description: Test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
3941 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline,TestVectorsUnknownInit)3942 TEST_F(MindDataTestPipeline, TestVectorsUnknownInit) {
3943   // Test with two parameters.
3944   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsUnknownInit.";
3945 
3946   // Create a TextFile dataset
3947   std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
3948   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3949   EXPECT_NE(ds, nullptr);
3950 
3951   std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
3952   std::shared_ptr<Vectors> vectors;
3953   Status s = Vectors::BuildFromFile(&vectors, vectors_dir, 100);
3954   EXPECT_EQ(s, Status::OK());
3955 
3956   std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
3957   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init);
3958   EXPECT_NE(lookup, nullptr);
3959 
3960   // Create Map operation on ds
3961   ds = ds->Map({lookup}, {"text"});
3962   EXPECT_NE(ds, nullptr);
3963 
3964   // Create an iterator over the result of the above dataset
3965   std::shared_ptr<Iterator> iter = ds->CreateIterator();
3966   EXPECT_NE(iter, nullptr);
3967 
3968   // Iterate the dataset and get each row
3969   std::unordered_map<std::string, mindspore::MSTensor> row;
3970   ASSERT_OK(iter->GetNextRow(&row));
3971 
3972   uint64_t i = 0;
3973   std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
3974                                               {-1, -1, -1, -1, -1, -1},
3975                                               {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
3976                                               {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
3977                                               {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
3978                                               {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
3979                                               {-1, -1, -1, -1, -1, -1}};
3980   while (row.size() != 0) {
3981     auto ind = row["text"];
3982     MS_LOG(INFO) << ind.Shape();
3983     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
3984     TensorPtr de_expected_item;
3985     dsize_t dim = 6;
3986     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
3987     mindspore::MSTensor ms_expected_item =
3988       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
3989     EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
3990 
3991     ASSERT_OK(iter->GetNextRow(&row));
3992     i++;
3993   }
3994 
3995   EXPECT_EQ(i, 7);
3996 
3997   // Manually terminate the pipeline
3998   iter->Stop();
3999 }
4000 
4001 /// Feature: Vectors
4002 /// Description: Test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
4003 ///     `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
4004 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline,TestVectorsAllParams)4005 TEST_F(MindDataTestPipeline, TestVectorsAllParams) {
4006   // Test with all parameters.
4007   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllParams.";
4008   // Create a TextFile dataset
4009   std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
4010   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
4011   EXPECT_NE(ds, nullptr);
4012 
4013   std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors.txt";
4014   std::shared_ptr<Vectors> vectors;
4015   Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
4016   EXPECT_EQ(s, Status::OK());
4017 
4018   std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
4019   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
4020   EXPECT_NE(lookup, nullptr);
4021 
4022   // Create Map operation on ds
4023   ds = ds->Map({lookup}, {"text"});
4024   EXPECT_NE(ds, nullptr);
4025 
4026   // Create an iterator over the result of the above dataset
4027   std::shared_ptr<Iterator> iter = ds->CreateIterator();
4028   EXPECT_NE(iter, nullptr);
4029 
4030   // Iterate the dataset and get each row
4031   std::unordered_map<std::string, mindspore::MSTensor> row;
4032   ASSERT_OK(iter->GetNextRow(&row));
4033 
4034   uint64_t i = 0;
4035   std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
4036                                               {-1, -1, -1, -1, -1, -1},
4037                                               {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
4038                                               {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
4039                                               {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
4040                                               {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
4041                                               {-1, -1, -1, -1, -1, -1}};
4042   while (row.size() != 0) {
4043     auto ind = row["text"];
4044     MS_LOG(INFO) << ind.Shape();
4045     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
4046     TensorPtr de_expected_item;
4047     dsize_t dim = 6;
4048     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
4049     mindspore::MSTensor ms_expected_item =
4050       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
4051     EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
4052 
4053     ASSERT_OK(iter->GetNextRow(&row));
4054     i++;
4055   }
4056 
4057   EXPECT_EQ(i, 7);
4058 
4059   // Manually terminate the pipeline
4060   iter->Stop();
4061 }
4062 
4063 /// Feature: Vectors
4064 /// Description: Test with pre-vectors set that have the different dimension
4065 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsDifferentDimension) {
  // BuildFromFile must reject a vector file whose rows have differing widths.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDifferentDimension.";

  // A token dataset is still constructed to mirror the success-case setup.
  std::string tokens_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({tokens_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Loading the inconsistent file is expected to fail.
  std::string vectors_path = datasets_root_path_ + "/testVectors/vectors_dim_different.txt";
  std::shared_ptr<Vectors> vectors;
  Status build_status = Vectors::BuildFromFile(&vectors, vectors_path, 100);
  EXPECT_NE(build_status, Status::OK());
}
4080 
4081 /// Feature: Vectors
4082 /// Description: Test with pre-vectors set that has the head-info
4083 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline,TestVectorsWithHeadInfo)4084 TEST_F(MindDataTestPipeline, TestVectorsWithHeadInfo) {
4085   // Test with words that has head info.
4086   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithHeadInfo.";
4087   // Create a TextFile dataset
4088   std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
4089   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
4090   EXPECT_NE(ds, nullptr);
4091 
4092   std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_with_info.txt";
4093   std::shared_ptr<Vectors> vectors;
4094   Status s = Vectors::BuildFromFile(&vectors, vectors_dir);
4095   EXPECT_EQ(s, Status::OK());
4096 
4097   std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
4098   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
4099   EXPECT_NE(lookup, nullptr);
4100 
4101   // Create Map operation on ds
4102   ds = ds->Map({lookup}, {"text"});
4103   EXPECT_NE(ds, nullptr);
4104 
4105   // Create an iterator over the result of the above dataset
4106   std::shared_ptr<Iterator> iter = ds->CreateIterator();
4107   EXPECT_NE(iter, nullptr);
4108 
4109   // Iterate the dataset and get each row
4110   std::unordered_map<std::string, mindspore::MSTensor> row;
4111   ASSERT_OK(iter->GetNextRow(&row));
4112 
4113   uint64_t i = 0;
4114   std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
4115                                               {-1, -1, -1, -1, -1, -1},
4116                                               {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
4117                                               {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
4118                                               {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
4119                                               {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
4120                                               {-1, -1, -1, -1, -1, -1}};
4121   while (row.size() != 0) {
4122     auto ind = row["text"];
4123     MS_LOG(INFO) << ind.Shape();
4124     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
4125     TensorPtr de_expected_item;
4126     dsize_t dim = 6;
4127     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
4128     mindspore::MSTensor ms_expected_item =
4129       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
4130     EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
4131 
4132     ASSERT_OK(iter->GetNextRow(&row));
4133     i++;
4134   }
4135 
4136   EXPECT_EQ(i, 7);
4137 
4138   // Manually terminate the pipeline
4139   iter->Stop();
4140 }
4141 
4142 /// Feature: Vectors
4143 /// Description: Test with the parameter max_vectors that is <= 0
4144 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsMaxVectorsLessThanZero) {
  // BuildFromFile must reject a non-positive max_vectors argument.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsMaxVectorsLessThanZero.";

  // A token dataset is still constructed to mirror the success-case setup.
  std::string tokens_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({tokens_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // max_vectors = -1 is invalid and must produce an error status.
  std::string vectors_path = datasets_root_path_ + "/testVectors/vectors.txt";
  std::shared_ptr<Vectors> vectors;
  Status build_status = Vectors::BuildFromFile(&vectors, vectors_path, -1);
  EXPECT_NE(build_status, Status::OK());
}
4159 
4160 /// Feature: Vectors
4161 /// Description: Test with the pre-vectors file that is empty
4162 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsWithEmptyFile) {
  // BuildFromFile must reject an empty vector file.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithEmptyFile.";

  // A token dataset is still constructed to mirror the success-case setup.
  std::string tokens_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({tokens_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Loading an empty file is expected to fail.
  std::string vectors_path = datasets_root_path_ + "/testVectors/vectors_empty.txt";
  std::shared_ptr<Vectors> vectors;
  Status build_status = Vectors::BuildFromFile(&vectors, vectors_path);
  EXPECT_NE(build_status, Status::OK());
}
4177 
4178 /// Feature: Vectors
4179 /// Description: Test with the pre-vectors file that is not exist
4180 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsWithNotExistFile) {
  // BuildFromFile must fail cleanly when the vector file does not exist.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithNotExistFile.";

  // A token dataset is still constructed to mirror the success-case setup.
  std::string tokens_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({tokens_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Loading a missing file is expected to fail.
  std::string vectors_path = datasets_root_path_ + "/testVectors/no_vectors.txt";
  std::shared_ptr<Vectors> vectors;
  Status build_status = Vectors::BuildFromFile(&vectors, vectors_path);
  EXPECT_NE(build_status, Status::OK());
}
4195 
4196 /// Feature: Vectors
4197 /// Description: Test with the pre-vectors set that has a situation that info-head is not the first line in the set
4198 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsWithWrongInfoFile) {
  // BuildFromFile must reject a file whose header line is misplaced.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithWrongInfoFile.";

  // A token dataset is still constructed to mirror the success-case setup.
  std::string tokens_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({tokens_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Loading the malformed-header file is expected to fail.
  std::string vectors_path = datasets_root_path_ + "/testVectors/vectors_with_wrong_info.txt";
  std::shared_ptr<Vectors> vectors;
  Status build_status = Vectors::BuildFromFile(&vectors, vectors_path);
  EXPECT_NE(build_status, Status::OK());
}
4213 
4214 /// Feature: FastText
4215 /// Description: Test with default parameter in function BuildFromFile and function Lookup
4216 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline,TestFastTextDefaultParam)4217 TEST_F(MindDataTestPipeline, TestFastTextDefaultParam) {
4218   // Test with default parameter.
4219   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDefaultParam.";
4220 
4221   // Create a TextFile dataset
4222   std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
4223   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
4224   EXPECT_NE(ds, nullptr);
4225 
4226   std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
4227   std::shared_ptr<FastText> fast_text;
4228   Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
4229   EXPECT_EQ(s, Status::OK());
4230 
4231   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text);
4232   EXPECT_NE(lookup, nullptr);
4233 
4234   // Create Map operation on ds
4235   ds = ds->Map({lookup}, {"text"});
4236   EXPECT_NE(ds, nullptr);
4237 
4238   // Create an iterator over the result of the above dataset
4239   std::shared_ptr<Iterator> iter = ds->CreateIterator();
4240   EXPECT_NE(iter, nullptr);
4241 
4242   // Iterate the dataset and get each row
4243   std::unordered_map<std::string, mindspore::MSTensor> row;
4244   ASSERT_OK(iter->GetNextRow(&row));
4245 
4246   uint64_t i = 0;
4247   std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
4248                                               {0, 0, 0, 0, 0, 0},
4249                                               {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
4250                                               {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
4251                                               {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
4252                                               {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
4253                                               {0, 0, 0, 0, 0, 0}};
4254   while (row.size() != 0) {
4255     auto ind = row["text"];
4256     MS_LOG(INFO) << ind.Shape();
4257     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
4258     TensorPtr de_expected_item;
4259     dsize_t dim = 6;
4260     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
4261     mindspore::MSTensor ms_expected_item =
4262       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
4263     EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
4264 
4265     ASSERT_OK(iter->GetNextRow(&row));
4266     i++;
4267   }
4268 
4269   EXPECT_EQ(i, 7);
4270 
4271   // Manually terminate the pipeline
4272   iter->Stop();
4273 }
4274 
4275 /// Feature: FastText
4276 /// Description: Test with all parameters which include `path` and `max_vector` in function BuildFromFile
4277 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline,TestFastTextAllBuildfromfileParams)4278 TEST_F(MindDataTestPipeline, TestFastTextAllBuildfromfileParams) {
4279   // Test with two parameters.
4280   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllBuildfromfileParams.";
4281 
4282   // Create a TextFile dataset
4283   std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
4284   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
4285   EXPECT_NE(ds, nullptr);
4286 
4287   std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
4288   std::shared_ptr<FastText> fast_text;
4289   Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
4290   EXPECT_EQ(s, Status::OK());
4291 
4292   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text);
4293   EXPECT_NE(lookup, nullptr);
4294 
4295   // Create Map operation on ds
4296   ds = ds->Map({lookup}, {"text"});
4297   EXPECT_NE(ds, nullptr);
4298 
4299   // Create an iterator over the result of the above dataset
4300   std::shared_ptr<Iterator> iter = ds->CreateIterator();
4301   EXPECT_NE(iter, nullptr);
4302 
4303   // Iterate the dataset and get each row
4304   std::unordered_map<std::string, mindspore::MSTensor> row;
4305   ASSERT_OK(iter->GetNextRow(&row));
4306 
4307   uint64_t i = 0;
4308   std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
4309                                               {0, 0, 0, 0, 0, 0},
4310                                               {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
4311                                               {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
4312                                               {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
4313                                               {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
4314                                               {0, 0, 0, 0, 0, 0}};
4315   while (row.size() != 0) {
4316     auto ind = row["text"];
4317     MS_LOG(INFO) << ind.Shape();
4318     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
4319     TensorPtr de_expected_item;
4320     dsize_t dim = 6;
4321     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
4322     mindspore::MSTensor ms_expected_item =
4323       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
4324     EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
4325 
4326     ASSERT_OK(iter->GetNextRow(&row));
4327     i++;
4328   }
4329 
4330   EXPECT_EQ(i, 7);
4331 
4332   // Manually terminate the pipeline
4333   iter->Stop();
4334 }
4335 
4336 /// Feature: FastText
4337 /// Description: Test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
4338 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline,TestFastTextUnknownInit)4339 TEST_F(MindDataTestPipeline, TestFastTextUnknownInit) {
4340   // Test with two parameters.
4341   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextUnknownInit.";
4342 
4343   // Create a TextFile dataset
4344   std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
4345   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
4346   EXPECT_NE(ds, nullptr);
4347 
4348   std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
4349   std::shared_ptr<FastText> fast_text;
4350   Status s = FastText::BuildFromFile(&fast_text, vectors_dir, 100);
4351   EXPECT_EQ(s, Status::OK());
4352 
4353   std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
4354   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text, unknown_init);
4355   EXPECT_NE(lookup, nullptr);
4356 
4357   // Create Map operation on ds
4358   ds = ds->Map({lookup}, {"text"});
4359   EXPECT_NE(ds, nullptr);
4360 
4361   // Create an iterator over the result of the above dataset
4362   std::shared_ptr<Iterator> iter = ds->CreateIterator();
4363   EXPECT_NE(iter, nullptr);
4364 
4365   // Iterate the dataset and get each row
4366   std::unordered_map<std::string, mindspore::MSTensor> row;
4367   ASSERT_OK(iter->GetNextRow(&row));
4368 
4369   uint64_t i = 0;
4370   std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
4371                                               {-1, -1, -1, -1, -1, -1},
4372                                               {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
4373                                               {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
4374                                               {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
4375                                               {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
4376                                               {-1, -1, -1, -1, -1, -1}};
4377   while (row.size() != 0) {
4378     auto ind = row["text"];
4379     MS_LOG(INFO) << ind.Shape();
4380     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
4381     TensorPtr de_expected_item;
4382     dsize_t dim = 6;
4383     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
4384     mindspore::MSTensor ms_expected_item =
4385       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
4386     EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
4387 
4388     ASSERT_OK(iter->GetNextRow(&row));
4389     i++;
4390   }
4391 
4392   EXPECT_EQ(i, 7);
4393 
4394   // Manually terminate the pipeline
4395   iter->Stop();
4396 }
4397 
4398 /// Feature: FastText
4399 /// Description: Test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
4400 ///     `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
4401 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline,TestFastTextAllParams)4402 TEST_F(MindDataTestPipeline, TestFastTextAllParams) {
4403   // Test with all parameters.
4404   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllParams.";
4405   // Create a TextFile dataset
4406   std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
4407   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
4408   EXPECT_NE(ds, nullptr);
4409 
4410   std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.vec";
4411   std::shared_ptr<FastText> fast_text;
4412   Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
4413   EXPECT_EQ(s, Status::OK());
4414 
4415   std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
4416   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(fast_text, unknown_init, true);
4417   EXPECT_NE(lookup, nullptr);
4418 
4419   // Create Map operation on ds
4420   ds = ds->Map({lookup}, {"text"});
4421   EXPECT_NE(ds, nullptr);
4422 
4423   // Create an iterator over the result of the above dataset
4424   std::shared_ptr<Iterator> iter = ds->CreateIterator();
4425   EXPECT_NE(iter, nullptr);
4426 
4427   // Iterate the dataset and get each row
4428   std::unordered_map<std::string, mindspore::MSTensor> row;
4429   ASSERT_OK(iter->GetNextRow(&row));
4430 
4431   uint64_t i = 0;
4432   std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
4433                                               {-1, -1, -1, -1, -1, -1},
4434                                               {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
4435                                               {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
4436                                               {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
4437                                               {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
4438                                               {-1, -1, -1, -1, -1, -1}};
4439   while (row.size() != 0) {
4440     auto ind = row["text"];
4441     MS_LOG(INFO) << ind.Shape();
4442     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
4443     TensorPtr de_expected_item;
4444     dsize_t dim = 6;
4445     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
4446     mindspore::MSTensor ms_expected_item =
4447       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
4448     EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
4449 
4450     ASSERT_OK(iter->GetNextRow(&row));
4451     i++;
4452   }
4453 
4454   EXPECT_EQ(i, 7);
4455 
4456   // Manually terminate the pipeline
4457   iter->Stop();
4458 }
4459 
4460 /// Feature: FastText
4461 /// Description: Test with pre-vectors set that have the different dimension
4462 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextDifferentDimension) {
  // Rows with inconsistent vector dimensions must be rejected.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDifferentDimension.";

  // Build a TextFile dataset over the token list.
  const std::string token_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> text_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(text_ds, nullptr);

  // This fixture mixes vector lengths; BuildFromFile should report an error.
  const std::string bad_dim_file = datasets_root_path_ + "/test_fast_text/fasttext_dim_different.vec";
  std::shared_ptr<FastText> fast_text;
  const Status rc = FastText::BuildFromFile(&fast_text, bad_dim_file, 100);
  EXPECT_NE(rc, Status::OK());
}
4477 
4478 /// Feature: FastText
4479 /// Description: Test with the parameter max_vectors that is <= 0
4480 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextMaxVectorsLessThanZero) {
  // A non-positive max_vectors argument must be rejected.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextMaxVectorsLessThanZero.";

  // Build a TextFile dataset over the token list.
  const std::string token_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> text_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(text_ds, nullptr);

  // max_vectors = -1 is invalid; BuildFromFile should report an error.
  const std::string pretrained_file = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  const Status rc = FastText::BuildFromFile(&fast_text, pretrained_file, -1);
  EXPECT_NE(rc, Status::OK());
}
4495 
4496 /// Feature: FastText
4497 /// Description: Test with the pre-vectors file that is empty
4498 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithEmptyFile) {
  // An empty pre-trained vectors file must be rejected.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithEmptyFile.";

  // Build a TextFile dataset over the token list.
  const std::string token_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> text_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(text_ds, nullptr);

  // This fixture is empty; BuildFromFile should report an error.
  const std::string empty_file = datasets_root_path_ + "/test_fast_text/fasttext_empty.vec";
  std::shared_ptr<FastText> fast_text;
  const Status rc = FastText::BuildFromFile(&fast_text, empty_file);
  EXPECT_NE(rc, Status::OK());
}
4513 
4514 /// Feature: FastText
/// Description: Test with the pre-vectors file that does not exist
4516 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithNotExistFile) {
  // Building a FastText table from a missing file must fail.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithNotExistFile.";

  // Build a TextFile dataset over the token list.
  const std::string token_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> text_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(text_ds, nullptr);

  // Point at a pre-trained vectors file that does not exist on disk.
  const std::string missing_file = datasets_root_path_ + "/test_fast_text/no_fasttext.vec";
  std::shared_ptr<FastText> fast_text;
  const Status rc = FastText::BuildFromFile(&fast_text, missing_file);
  EXPECT_NE(rc, Status::OK());
}
4531 
4532 /// Feature: FastText
4533 /// Description: Test with the pre-vectors set that has a situation that info-head is not the first line in the set
4534 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithWrongInfoFile) {
  // A vectors file whose info header is not the first line must be rejected.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongInfoFile.";

  // Build a TextFile dataset over the token list.
  const std::string token_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> text_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(text_ds, nullptr);

  // This fixture has a misplaced info header; BuildFromFile should report an error.
  const std::string bad_info_file = datasets_root_path_ + "/test_fast_text/fasttext_with_wrong_info.vec";
  std::shared_ptr<FastText> fast_text;
  const Status rc = FastText::BuildFromFile(&fast_text, bad_info_file);
  EXPECT_NE(rc, Status::OK());
}
4549 
4550 /// Feature: FastText
4551 /// Description: Test with the pre-vectors set that has a wrong suffix
4552 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithWrongSuffix) {
  // FastText requires a `.vec` file; any other suffix must be rejected.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongSuffix.";

  // Build a TextFile dataset over the token list.
  const std::string token_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> text_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(text_ds, nullptr);

  // A `.txt` suffix is not accepted; BuildFromFile should report an error.
  const std::string wrong_suffix_file = datasets_root_path_ + "/test_fast_text/fast_text.txt";
  std::shared_ptr<FastText> fast_text;
  const Status rc = FastText::BuildFromFile(&fast_text, wrong_suffix_file);
  EXPECT_NE(rc, Status::OK());
}
4567 
4568 /// Feature: GloVe
4569 /// Description: Test with default parameter in function BuildFromFile and function Lookup
4570 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline,TestGloVeDefaultParam)4571 TEST_F(MindDataTestPipeline, TestGloVeDefaultParam) {
4572   // Test with default parameter.
4573   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeDefaultParam.";
4574 
4575   // Create a TextFile dataset
4576   std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
4577   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
4578   EXPECT_NE(ds, nullptr);
4579 
4580   std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
4581   std::shared_ptr<GloVe> glove;
4582   Status s = GloVe::BuildFromFile(&glove, vectors_dir);
4583   EXPECT_EQ(s, Status::OK());
4584 
4585   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove);
4586   EXPECT_NE(lookup, nullptr);
4587 
4588   // Create Map operation on ds
4589   ds = ds->Map({lookup}, {"text"});
4590   EXPECT_NE(ds, nullptr);
4591 
4592   // Create an iterator over the result of the above dataset
4593   std::shared_ptr<Iterator> iter = ds->CreateIterator();
4594   EXPECT_NE(iter, nullptr);
4595 
4596   // Iterate the dataset and get each row
4597   std::unordered_map<std::string, mindspore::MSTensor> row;
4598   ASSERT_OK(iter->GetNextRow(&row));
4599 
4600   uint64_t i = 0;
4601   std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
4602                                               {0, 0, 0, 0, 0, 0},
4603                                               {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
4604                                               {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
4605                                               {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
4606                                               {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
4607                                               {0, 0, 0, 0, 0, 0}};
4608   while (row.size() != 0) {
4609     auto ind = row["text"];
4610     MS_LOG(INFO) << ind.Shape();
4611     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
4612     TensorPtr de_expected_item;
4613     dsize_t dim = 6;
4614     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
4615     mindspore::MSTensor ms_expected_item =
4616       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
4617     EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
4618 
4619     ASSERT_OK(iter->GetNextRow(&row));
4620     i++;
4621   }
4622 
4623   EXPECT_EQ(i, 7);
4624 
4625   // Manually terminate the pipeline
4626   iter->Stop();
4627 }
4628 
4629 /// Feature: GloVe
4630 /// Description: Test with all parameters which include `path` and `max_vector` in function BuildFromFile
4631 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline,TestGloVeAllBuildfromfileParams)4632 TEST_F(MindDataTestPipeline, TestGloVeAllBuildfromfileParams) {
4633   // Test with two parameters.
4634   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeAllBuildfromfileParams.";
4635 
4636   // Create a TextFile dataset
4637   std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
4638   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
4639   EXPECT_NE(ds, nullptr);
4640 
4641   std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
4642   std::shared_ptr<GloVe> glove;
4643   Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100);
4644   EXPECT_EQ(s, Status::OK());
4645 
4646   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove);
4647   EXPECT_NE(lookup, nullptr);
4648 
4649   // Create Map operation on ds
4650   ds = ds->Map({lookup}, {"text"});
4651   EXPECT_NE(ds, nullptr);
4652 
4653   // Create an iterator over the result of the above dataset
4654   std::shared_ptr<Iterator> iter = ds->CreateIterator();
4655   EXPECT_NE(iter, nullptr);
4656 
4657   // Iterate the dataset and get each row
4658   std::unordered_map<std::string, mindspore::MSTensor> row;
4659   ASSERT_OK(iter->GetNextRow(&row));
4660 
4661   uint64_t i = 0;
4662   std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
4663                                               {0, 0, 0, 0, 0, 0},
4664                                               {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
4665                                               {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
4666                                               {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
4667                                               {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
4668                                               {0, 0, 0, 0, 0, 0}};
4669   while (row.size() != 0) {
4670     auto ind = row["text"];
4671     MS_LOG(INFO) << ind.Shape();
4672     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
4673     TensorPtr de_expected_item;
4674     dsize_t dim = 6;
4675     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
4676     mindspore::MSTensor ms_expected_item =
4677       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
4678     EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
4679 
4680     ASSERT_OK(iter->GetNextRow(&row));
4681     i++;
4682   }
4683 
4684   EXPECT_EQ(i, 7);
4685 
4686   // Manually terminate the pipeline
4687   iter->Stop();
4688 }
4689 
4690 /// Feature: GloVe
4691 /// Description: Test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
4692 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline,TestGloVeUnknownInit)4693 TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) {
4694   // Test with two parameters.
4695   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeUnknownInit.";
4696 
4697   // Create a TextFile dataset
4698   std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
4699   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
4700   EXPECT_NE(ds, nullptr);
4701 
4702   std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
4703   std::shared_ptr<GloVe> glove;
4704   Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100);
4705   EXPECT_EQ(s, Status::OK());
4706 
4707   std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
4708   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove, unknown_init);
4709   EXPECT_NE(lookup, nullptr);
4710 
4711   // Create Map operation on ds
4712   ds = ds->Map({lookup}, {"text"});
4713   EXPECT_NE(ds, nullptr);
4714 
4715   // Create an iterator over the result of the above dataset
4716   std::shared_ptr<Iterator> iter = ds->CreateIterator();
4717   EXPECT_NE(iter, nullptr);
4718 
4719   // Iterate the dataset and get each row
4720   std::unordered_map<std::string, mindspore::MSTensor> row;
4721   ASSERT_OK(iter->GetNextRow(&row));
4722 
4723   uint64_t i = 0;
4724   std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
4725                                               {-1, -1, -1, -1, -1, -1},
4726                                               {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
4727                                               {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
4728                                               {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
4729                                               {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
4730                                               {-1, -1, -1, -1, -1, -1}};
4731   while (row.size() != 0) {
4732     auto ind = row["text"];
4733     MS_LOG(INFO) << ind.Shape();
4734     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
4735     TensorPtr de_expected_item;
4736     dsize_t dim = 6;
4737     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
4738     mindspore::MSTensor ms_expected_item =
4739       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
4740     EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
4741 
4742     ASSERT_OK(iter->GetNextRow(&row));
4743     i++;
4744   }
4745 
4746   EXPECT_EQ(i, 7);
4747 
4748   // Manually terminate the pipeline
4749   iter->Stop();
4750 }
4751 
4752 /// Feature: GloVe
4753 /// Description: Test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
4754 ///     `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
4755 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline,TestGloVeAllParams)4756 TEST_F(MindDataTestPipeline, TestGloVeAllParams) {
4757   // Test with all parameters.
4758   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeAllParams.";
4759   // Create a TextFile dataset
4760   std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
4761   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
4762   EXPECT_NE(ds, nullptr);
4763 
4764   std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
4765   std::shared_ptr<GloVe> glove;
4766   Status s = GloVe::BuildFromFile(&glove, vectors_dir);
4767   EXPECT_EQ(s, Status::OK());
4768 
4769   std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
4770   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove, unknown_init, true);
4771   EXPECT_NE(lookup, nullptr);
4772 
4773   // Create Map operation on ds
4774   ds = ds->Map({lookup}, {"text"});
4775   EXPECT_NE(ds, nullptr);
4776 
4777   // Create an iterator over the result of the above dataset
4778   std::shared_ptr<Iterator> iter = ds->CreateIterator();
4779   EXPECT_NE(iter, nullptr);
4780 
4781   // Iterate the dataset and get each row
4782   std::unordered_map<std::string, mindspore::MSTensor> row;
4783   ASSERT_OK(iter->GetNextRow(&row));
4784 
4785   uint64_t i = 0;
4786   std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
4787                                               {-1, -1, -1, -1, -1, -1},
4788                                               {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
4789                                               {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
4790                                               {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
4791                                               {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
4792                                               {-1, -1, -1, -1, -1, -1}};
4793   while (row.size() != 0) {
4794     auto ind = row["text"];
4795     MS_LOG(INFO) << ind.Shape();
4796     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
4797     TensorPtr de_expected_item;
4798     dsize_t dim = 6;
4799     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
4800     mindspore::MSTensor ms_expected_item =
4801       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
4802     EXPECT_MSTENSOR_EQ(ind, ms_expected_item);
4803 
4804     ASSERT_OK(iter->GetNextRow(&row));
4805     i++;
4806   }
4807 
4808   EXPECT_EQ(i, 7);
4809 
4810   // Manually terminate the pipeline
4811   iter->Stop();
4812 }
4813 
4814 /// Feature: GloVe
4815 /// Description: Test with pre-vectors set that have the different dimension
4816 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeDifferentDimension) {
  // Rows with inconsistent vector dimensions must be rejected.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeDifferentDimension.";

  // Build a TextFile dataset over the token list.
  const std::string token_file = datasets_root_path_ + "/testGloVe/words.txt";
  std::shared_ptr<Dataset> text_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(text_ds, nullptr);

  // This fixture mixes vector lengths; BuildFromFile should report an error.
  const std::string bad_dim_file = datasets_root_path_ + "/testGloVe/glove.6B.dim_different.txt";
  std::shared_ptr<GloVe> glove;
  const Status rc = GloVe::BuildFromFile(&glove, bad_dim_file, 100);
  EXPECT_NE(rc, Status::OK());
}
4831 
4832 /// Feature: GloVe
4833 /// Description: Test with the parameter max_vectors that is <= 0
4834 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeMaxVectorsLessThanZero) {
  // A non-positive max_vectors argument must be rejected.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeMaxVectorsLessThanZero.";

  // Build a TextFile dataset over the token list.
  const std::string token_file = datasets_root_path_ + "/testGloVe/words.txt";
  std::shared_ptr<Dataset> text_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(text_ds, nullptr);

  // max_vectors = -1 is invalid; BuildFromFile should report an error.
  const std::string pretrained_file = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  std::shared_ptr<GloVe> glove;
  const Status rc = GloVe::BuildFromFile(&glove, pretrained_file, -1);
  EXPECT_NE(rc, Status::OK());
}
4849 
4850 /// Feature: GloVe
4851 /// Description: Test with the pre-vectors file that is empty
4852 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeWithEmptyFile) {
  // An empty pre-trained vectors file must be rejected.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithEmptyFile.";

  // Build a TextFile dataset over the token list.
  const std::string token_file = datasets_root_path_ + "/testGloVe/words.txt";
  std::shared_ptr<Dataset> text_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(text_ds, nullptr);

  // This fixture is empty; BuildFromFile should report an error.
  const std::string empty_file = datasets_root_path_ + "/testGloVe/glove.6B.empty.txt";
  std::shared_ptr<GloVe> glove;
  const Status rc = GloVe::BuildFromFile(&glove, empty_file);
  EXPECT_NE(rc, Status::OK());
}
4867 
4868 /// Feature: GloVe
/// Description: Test with the pre-vectors file that does not exist
4870 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeWithNotExistFile) {
  // Test with not exist file.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithNotExistFile.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Bug fix: this test previously pointed at "glove.6B.empty.txt" — an existing (empty)
  // fixture already covered by TestGloVeWithEmptyFile — so the missing-file error path
  // was never exercised. Use a path that does not exist, matching the naming convention
  // of the Vectors ("no_vectors.txt") and FastText ("no_fasttext.vec") tests.
  std::string vectors_dir = datasets_root_path_ + "/testGloVe/no_glove.6B.txt";
  std::shared_ptr<GloVe> glove;
  Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  EXPECT_NE(s, Status::OK());
}
4885 
4886 /// Feature: GloVe
4887 /// Description: Test with the pre-vectors set that has a situation that info-head is not the first line in the set
4888 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline,TestGloVeWithWrongInfoFile)4889 TEST_F(MindDataTestPipeline, TestGloVeWithWrongInfoFile) {
4890   // Wrong info.
4891   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithWrongInfoFile.";
4892 
4893   // Create a TextFile dataset
4894   std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
4895   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
4896   EXPECT_NE(ds, nullptr);
4897 
4898   std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.with_wrong_info.txt";
4899   std::shared_ptr<GloVe> glove;
4900   Status s = GloVe::BuildFromFile(&glove, vectors_dir);
4901   EXPECT_NE(s, Status::OK());
4902 }
4903 
4904 /// Feature: GloVe
4905 /// Description: Test with the pre-vectors set that has a wrong format
4906 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline,TestGloVeWithWrongFormat)4907 TEST_F(MindDataTestPipeline, TestGloVeWithWrongFormat) {
4908   // Wrong info.
4909   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithWrongFormat.";
4910 
4911   // Create a TextFile dataset
4912   std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
4913   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
4914   EXPECT_NE(ds, nullptr);
4915 
4916   std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.tests.vec";
4917   std::shared_ptr<GloVe> glove;
4918   Status s = GloVe::BuildFromFile(&glove, vectors_dir);
4919   EXPECT_NE(s, Status::OK());
4920 }
4921 
4922 /// Feature: CharNGram
4923 /// Description: Test with default parameter in function BuildFromFile and function Lookup
4924 /// Expectation: Return correct MSTensor which is equal to the excepted
TEST_F(MindDataTestPipeline,TestCharNGramDefaultParam)4925 TEST_F(MindDataTestPipeline, TestCharNGramDefaultParam) {
4926   // Test with default parameter.
4927   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramDefaultParam.";
4928 
4929   // Create a TextFile dataset
4930   std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
4931   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
4932   EXPECT_NE(ds, nullptr);
4933 
4934   std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
4935   std::shared_ptr<CharNGram> char_n_gram;
4936   Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
4937   EXPECT_EQ(s, Status::OK());
4938   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram);
4939   EXPECT_NE(lookup, nullptr);
4940 
4941   // Create Map operation on ds
4942   ds = ds->Map({lookup}, {"text"});
4943   EXPECT_NE(ds, nullptr);
4944 
4945   // Create an iterator over the result of the above dataset
4946   std::shared_ptr<Iterator> iter = ds->CreateIterator();
4947   EXPECT_NE(iter, nullptr);
4948 
4949   // Iterate the dataset and get each row
4950   std::unordered_map<std::string, mindspore::MSTensor> row;
4951   ASSERT_OK(iter->GetNextRow(&row));
4952 
4953   uint64_t i = 0;
4954   std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
4955                                               {0, 0, 0, 0, 0},
4956                                               {0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
4957                                               {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
4958                                               {0, 0, 0, 0, 0},
4959                                               {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
4960                                               {0, 0, 0, 0, 0}};
4961   while (row.size() != 0) {
4962     auto ind = row["text"];
4963     MS_LOG(INFO) << ind.Shape();
4964     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
4965     TensorPtr de_expected_item;
4966     ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
4967     mindspore::MSTensor ms_expected_item =
4968       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
4969     std::vector<int64_t> ind_shape = ind.Shape();
4970     std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
4971     EXPECT_EQ(ind_shape, ms_expected_shape);
4972 
4973     ASSERT_OK(iter->GetNextRow(&row));
4974     i++;
4975   }
4976 
4977   EXPECT_EQ(i, 7);
4978 
4979   // Manually terminate the pipeline
4980   iter->Stop();
4981 }
4982 
4983 /// Feature: CharNGram.
4984 /// Description: Test with all parameters which include `path` and `max_vector` in function BuildFromFile
4985 /// Expectation: Return correct MSTensor which is equal to the excepted
TEST_F(MindDataTestPipeline,TestCharNGramAllBuildfromfileParams)4986 TEST_F(MindDataTestPipeline, TestCharNGramAllBuildfromfileParams) {
4987   // Test with two parameters.
4988   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramAllBuildfromfileParams.";
4989 
4990   // Create a TextFile dataset
4991   std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
4992   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
4993   EXPECT_NE(ds, nullptr);
4994 
4995   std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
4996   std::shared_ptr<CharNGram> char_n_gram;
4997   Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, 18);
4998   EXPECT_EQ(s, Status::OK());
4999 
5000   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram);
5001   EXPECT_NE(lookup, nullptr);
5002 
5003   // Create Map operation on ds
5004   ds = ds->Map({lookup}, {"text"});
5005   EXPECT_NE(ds, nullptr);
5006 
5007   // Create an iterator over the result of the above dataset
5008   std::shared_ptr<Iterator> iter = ds->CreateIterator();
5009   EXPECT_NE(iter, nullptr);
5010 
5011   // Iterate the dataset and get each row
5012   std::unordered_map<std::string, mindspore::MSTensor> row;
5013   ASSERT_OK(iter->GetNextRow(&row));
5014 
5015   uint64_t i = 0;
5016   std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
5017                                               {0, 0, 0, 0, 0},
5018                                               {-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
5019                                               {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
5020                                               {0, 0, 0, 0, 0},
5021                                               {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
5022                                               {0, 0, 0, 0, 0}};
5023   while (row.size() != 0) {
5024     auto ind = row["text"];
5025     MS_LOG(INFO) << ind.Shape();
5026     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
5027     TensorPtr de_expected_item;
5028     ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
5029     mindspore::MSTensor ms_expected_item =
5030       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
5031     std::vector<int64_t> ind_shape = ind.Shape();
5032     std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
5033     EXPECT_EQ(ind_shape, ms_expected_shape);
5034 
5035     ASSERT_OK(iter->GetNextRow(&row));
5036     i++;
5037   }
5038 
5039   EXPECT_EQ(i, 7);
5040 
5041   // Manually terminate the pipeline
5042   iter->Stop();
5043 }
5044 
5045 /// Feature: CharNGram
5046 /// Description: Test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
5047 /// Expectation: Return correct MSTensor which is equal to the excepted
TEST_F(MindDataTestPipeline,TestCharNGramUnknownInit)5048 TEST_F(MindDataTestPipeline, TestCharNGramUnknownInit) {
5049   // Test with two parameters.
5050   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramUnknownInit.";
5051 
5052   // Create a TextFile dataset
5053   std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
5054   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
5055   EXPECT_NE(ds, nullptr);
5056 
5057   std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
5058   std::shared_ptr<CharNGram> char_n_gram;
5059   Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, 18);
5060   EXPECT_EQ(s, Status::OK());
5061 
5062   std::vector<float> unknown_init(5, -1);
5063   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram, unknown_init);
5064   EXPECT_NE(lookup, nullptr);
5065 
5066   // Create Map operation on ds
5067   ds = ds->Map({lookup}, {"text"});
5068   EXPECT_NE(ds, nullptr);
5069 
5070   // Create an iterator over the result of the above dataset
5071   std::shared_ptr<Iterator> iter = ds->CreateIterator();
5072   EXPECT_NE(iter, nullptr);
5073 
5074   // Iterate the dataset and get each row
5075   std::unordered_map<std::string, mindspore::MSTensor> row;
5076   ASSERT_OK(iter->GetNextRow(&row));
5077 
5078   uint64_t i = 0;
5079   std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
5080                                               {-1, -1, -1, -1, -1},
5081                                               {-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
5082                                               {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
5083                                               {-1, -1, -1, -1, -1},
5084                                               {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
5085                                               {-1, -1, -1, -1, -1}};
5086   while (row.size() != 0) {
5087     auto ind = row["text"];
5088     MS_LOG(INFO) << ind.Shape();
5089     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
5090     TensorPtr de_expected_item;
5091     ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
5092     mindspore::MSTensor ms_expected_item =
5093       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
5094     std::vector<int64_t> ind_shape = ind.Shape();
5095     std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
5096     EXPECT_EQ(ind_shape, ms_expected_shape);
5097 
5098     ASSERT_OK(iter->GetNextRow(&row));
5099     i++;
5100   }
5101 
5102   EXPECT_EQ(i, 7);
5103 
5104   // Manually terminate the pipeline
5105   iter->Stop();
5106 }
5107 
5108 /// Feature: CharNGram
5109 /// Description: Test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
5110 ///     `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
5111 /// Expectation: Return correct MSTensor which is equal to the excepted
TEST_F(MindDataTestPipeline,TestCharNGramAllParams)5112 TEST_F(MindDataTestPipeline, TestCharNGramAllParams) {
5113   // Test with all parameters.
5114   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramAllParams.";
5115   // Create a TextFile dataset
5116   std::string data_file = datasets_root_path_ + "/testVectors/words_with_big_letter.txt";
5117   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
5118   EXPECT_NE(ds, nullptr);
5119 
5120   std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
5121   std::shared_ptr<CharNGram> char_n_gram;
5122   Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
5123   EXPECT_EQ(s, Status::OK());
5124 
5125   std::vector<float> unknown_init(5, -1);
5126   std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram, unknown_init, true);
5127   EXPECT_NE(lookup, nullptr);
5128 
5129   // Create Map operation on ds
5130   ds = ds->Map({lookup}, {"text"});
5131   EXPECT_NE(ds, nullptr);
5132 
5133   // Create an iterator over the result of the above dataset
5134   std::shared_ptr<Iterator> iter = ds->CreateIterator();
5135   EXPECT_NE(iter, nullptr);
5136 
5137   // Iterate the dataset and get each row
5138   std::unordered_map<std::string, mindspore::MSTensor> row;
5139   ASSERT_OK(iter->GetNextRow(&row));
5140 
5141   uint64_t i = 0;
5142   std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
5143                                               {-1, -1, -1, -1, -1},
5144                                               {0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
5145                                               {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
5146                                               {-1, -1, -1, -1, -1},
5147                                               {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
5148                                               {-1, -1, -1, -1, -1}};
5149   while (row.size() != 0) {
5150     auto ind = row["text"];
5151     MS_LOG(INFO) << ind.Shape();
5152     TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
5153     TensorPtr de_expected_item;
5154     ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
5155     mindspore::MSTensor ms_expected_item =
5156       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
5157     std::vector<int64_t> ind_shape = ind.Shape();
5158     std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
5159     EXPECT_EQ(ind_shape, ms_expected_shape);
5160 
5161     ASSERT_OK(iter->GetNextRow(&row));
5162     i++;
5163   }
5164 
5165   EXPECT_EQ(i, 7);
5166 
5167   // Manually terminate the pipeline
5168   iter->Stop();
5169 }
5170 
5171 /// Feature: CharNGram
5172 /// Description: Test with pre-vectors set that have the different dimension
5173 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline,TestCharNGramDifferentDimension)5174 TEST_F(MindDataTestPipeline, TestCharNGramDifferentDimension) {
5175   // Tokens don't have the same number of vectors.
5176   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramDifferentDimension.";
5177 
5178   // Create a TextFile dataset
5179   std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
5180   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
5181   EXPECT_NE(ds, nullptr);
5182 
5183   std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20_dim_different.txt";
5184   std::shared_ptr<CharNGram> char_n_gram;
5185   Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
5186   EXPECT_NE(s, Status::OK());
5187 }
5188 
5189 /// Feature: CharNGram
5190 /// Description: Test with the parameter max_vectors that is <= 0
5191 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline,TestCharNGramMaxVectorsLessThanZero)5192 TEST_F(MindDataTestPipeline, TestCharNGramMaxVectorsLessThanZero) {
5193   // Test with max_vectors <= 0.
5194   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramMaxVectorsLessThanZero.";
5195 
5196   // Create a TextFile dataset
5197   std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
5198   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
5199   EXPECT_NE(ds, nullptr);
5200 
5201   std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
5202   std::shared_ptr<CharNGram> char_n_gram;
5203   Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, -1);
5204   EXPECT_NE(s, Status::OK());
5205 }
5206 
5207 /// Feature: CharNGram
5208 /// Description: Test with the pre-vectors file that is empty
5209 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline,TestCharNGramWithEmptyFile)5210 TEST_F(MindDataTestPipeline, TestCharNGramWithEmptyFile) {
5211   // Read empty file.
5212   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramWithEmptyFile.";
5213 
5214   // Create a TextFile dataset
5215   std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
5216   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
5217   EXPECT_NE(ds, nullptr);
5218 
5219   std::string vectors_dir = datasets_root_path_ + "/testVectors/vectors_empty.txt";
5220   std::shared_ptr<CharNGram> char_n_gram;
5221   Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
5222   EXPECT_NE(s, Status::OK());
5223 }
5224 
5225 /// Feature: CharNGram
5226 /// Description: Test with the pre-vectors file that is not exist
5227 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline,TestCharNGramsWithNotExistFile)5228 TEST_F(MindDataTestPipeline, TestCharNGramsWithNotExistFile) {
5229   // Test with not exist file.
5230   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramsWithNotExistFile.";
5231 
5232   // Create a TextFile dataset
5233   std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
5234   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
5235   EXPECT_NE(ds, nullptr);
5236 
5237   std::string vectors_dir = datasets_root_path_ + "/testVectors/no_vectors.txt";
5238   std::shared_ptr<CharNGram> char_n_gram;
5239   Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
5240   EXPECT_NE(s, Status::OK());
5241 }
5242 
5243 /// Feature: AddToken op
5244 /// Description: Test input 1d of AddToken op successfully
5245 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestAddTokenPipelineSuccess)5246 TEST_F(MindDataTestPipeline, TestAddTokenPipelineSuccess) {
5247   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAddTokenPipelineSuccess.";
5248 
5249   // Create a TextFile dataset
5250   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
5251   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
5252   EXPECT_NE(ds, nullptr);
5253 
5254   // Create Take operation on ds
5255   ds = ds->Take(1);
5256   EXPECT_NE(ds, nullptr);
5257 
5258   // Create white_tokenizer operation on ds
5259   std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
5260   EXPECT_NE(white_tokenizer, nullptr);
5261 
5262   // Create add_token operation on ds
5263   std::shared_ptr<TensorTransform> add_token = std::make_shared<text::AddToken>("TOKEN", true);
5264   EXPECT_NE(add_token, nullptr);
5265 
5266   // Create Map operation on ds
5267   ds = ds->Map({white_tokenizer, add_token}, {"text"});
5268   EXPECT_NE(ds, nullptr);
5269 
5270   // Create an iterator over the result of the above dataset
5271   // This will trigger the creation of the Execution Tree and launch it.
5272   std::shared_ptr<Iterator> iter = ds->CreateIterator();
5273   EXPECT_NE(iter, nullptr);
5274 
5275   // Iterate the dataset and get each row
5276   std::unordered_map<std::string, mindspore::MSTensor> row;
5277   ASSERT_OK(iter->GetNextRow(&row));
5278 
5279   std::vector<std::string> expected = {"TOKEN", "This", "is", "a", "text", "file."};
5280   std::shared_ptr<Tensor> de_expected_tensor;
5281   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
5282   mindspore::MSTensor expected_tensor =
5283     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
5284 
5285   uint64_t i = 0;
5286   while (row.size() != 0) {
5287     auto ind = row["text"];
5288     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
5289     ASSERT_OK(iter->GetNextRow(&row));
5290     i++;
5291   }
5292 
5293   EXPECT_EQ(i, 1);
5294 
5295   // Manually terminate the pipeline
5296   iter->Stop();
5297 }
5298 
5299 /// Feature: Truncate
5300 /// Description: Test Truncate basic usage max_seq_len less length
5301 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestTruncateSuccess1D)5302 TEST_F(MindDataTestPipeline, TestTruncateSuccess1D) {
5303   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSuccess1D.";
5304   // Testing basic Truncate
5305 
5306   // Create a TextFile dataset
5307   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
5308   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
5309   EXPECT_NE(ds, nullptr);
5310 
5311   // Create white_tokenizer operation on ds
5312   std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
5313   EXPECT_NE(white_tokenizer, nullptr);
5314 
5315   // Create a truncate operation on ds
5316   std::shared_ptr<TensorTransform> truncate = std::make_shared<text::Truncate>(3);
5317   EXPECT_NE(truncate, nullptr);
5318 
5319   // Create Map operation on ds
5320   ds = ds->Map({white_tokenizer, truncate}, {"text"});
5321   EXPECT_NE(ds, nullptr);
5322 
5323   // Create an iterator over the result of the above dataset
5324   // This will trigger the creation of the Execution Tree and launch it.
5325   std::shared_ptr<Iterator> iter = ds->CreateIterator();
5326   EXPECT_NE(iter, nullptr);
5327 
5328   // Iterate the dataset and get each row
5329   std::unordered_map<std::string, mindspore::MSTensor> row;
5330   ASSERT_OK(iter->GetNextRow(&row));
5331 
5332   std::vector<std::vector<std::string>> expected = {
5333     {"This", "is", "a"}, {"Be", "happy", "every"}, {"Good", "luck", "to"}};
5334 
5335   uint64_t i = 0;
5336   while (row.size() != 0) {
5337     auto ind = row["text"];
5338 
5339     std::shared_ptr<Tensor> de_expected_tensor;
5340     ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
5341     mindspore::MSTensor expected_tensor =
5342       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
5343     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
5344 
5345     ASSERT_OK(iter->GetNextRow(&row));
5346     i++;
5347   }
5348 
5349   EXPECT_EQ(i, 3);
5350 
5351   // Manually terminate the pipeline
5352   iter->Stop();
5353 }
5354 
5355 /// Feature: Truncate
5356 /// Description: Test the incorrect parameter of Truncate interface
5357 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestTruncateFail)5358 TEST_F(MindDataTestPipeline, TestTruncateFail) {
5359   // Testing the incorrect parameter of Truncate interface.
5360   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateFail.";
5361 
5362   // Create a TextFile dataset
5363   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
5364   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
5365   EXPECT_NE(ds, nullptr);
5366 
5367   // Testing the parameter max_seq_len less than 0
5368   // Create a truncate operation on ds
5369   std::shared_ptr<TensorTransform> truncate = std::make_shared<text::Truncate>(-1);
5370   EXPECT_NE(truncate, nullptr);
5371 
5372   // Create a Map operation on ds
5373   ds = ds->Map({truncate});
5374   EXPECT_NE(ds, nullptr);
5375 
5376   std::shared_ptr<Iterator> iter = ds->CreateIterator();
5377   // Expect failure: invalid Truncate input (The parameter max_seq_len must be greater than  0)
5378   EXPECT_EQ(iter, nullptr);
5379 }
5380