• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright 2020-2021 Huawei Technologies Co., Ltd
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 #include <memory>
17 #include <vector>
18 #include <string>
19 
20 #include "common/common.h"
21 #include "include/api/status.h"
22 #include "minddata/dataset/include/dataset/config.h"
23 #include "minddata/dataset/include/dataset/datasets.h"
24 #include "minddata/dataset/include/dataset/text.h"
25 #include "minddata/dataset/include/dataset/transforms.h"
26 #include "minddata/dataset/text/vocab.h"
27 
28 using namespace mindspore::dataset;
29 using mindspore::Status;
30 using mindspore::dataset::ShuffleMode;
31 using mindspore::dataset::Tensor;
32 using mindspore::dataset::Vocab;
33 
// Test fixture for the text-op pipeline tests below; all shared state
// (e.g. datasets_root_path_) is inherited from UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};
37 
38 TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess1) {
39   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess1.";
40   // Test BasicTokenizer with default parameters
41 
42   // Create a TextFile dataset
43   std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
44   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
45   EXPECT_NE(ds, nullptr);
46 
47   // Create Take operation on ds
48   ds = ds->Take(6);
49   EXPECT_NE(ds, nullptr);
50 
51   // Create BasicTokenizer operation on ds
52   std::shared_ptr<TensorTransform> basic_tokenizer = std::make_shared<text::BasicTokenizer>();
53   EXPECT_NE(basic_tokenizer, nullptr);
54 
55   // Create Map operation on ds
56   ds = ds->Map({basic_tokenizer}, {"text"});
57   EXPECT_NE(ds, nullptr);
58 
59   // Create an iterator over the result of the above dataset
60   // This will trigger the creation of the Execution Tree and launch it.
61   std::shared_ptr<Iterator> iter = ds->CreateIterator();
62   EXPECT_NE(iter, nullptr);
63 
64   // Iterate the dataset and get each row
65   std::unordered_map<std::string, mindspore::MSTensor> row;
66   ASSERT_OK(iter->GetNextRow(&row));
67 
68   std::vector<std::vector<std::string>> expected = {
69     {"Welcome", "to", "Beijing", "北", "京", "欢", "迎", "您"},
70     {"長", "風", "破", "浪", "會", "有", "時", ",", "直", "掛", "雲", "帆", "濟", "滄", "海"},
71     {"��", "嘿", "嘿", "��", "哈", "哈", "��", "大", "笑", "��", "嘻", "嘻"},
72     {"明", "朝", "(", "1368", "—",  "1644", "年", ")", "和", "清", "朝", "(", "1644", "—",  "1911", "年", ")",
73      ",", "是", "中", "国",   "封", "建",   "王", "朝", "史", "上", "最", "后", "两",   "个", "朝",   "代"},
74     {"明", "代",   "(", "1368",     "-",  "1644", ")",      "と", "清", "代",    "(", "1644",
75      "-",  "1911", ")", "は",       "、", "中",   "国",      "の", "封", "建",    "王", "朝",
76      "の", "歴",   "史", "における", "最", "後",   "の2つの", "王", "朝", "でした"},
77     {"명나라", "(", "1368", "-",    "1644", ")",      "와",       "청나라", "(",  "1644",    "-",
78      "1911",   ")", "는",   "중국", "봉건", "왕조의", "역사에서", "마지막", "두", "왕조였다"}};
79 
80   uint64_t i = 0;
81   while (row.size() != 0) {
82     auto ind = row["text"];
83     std::shared_ptr<Tensor> de_expected_tensor;
84     ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
85     mindspore::MSTensor expected_tensor =
86       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
87     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
88 
89     ASSERT_OK(iter->GetNextRow(&row));
90     i++;
91   }
92 
93   EXPECT_EQ(i, 6);
94 
95   // Manually terminate the pipeline
96   iter->Stop();
97 }
98 
99 TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess2) {
100   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess2.";
101   // Test BasicTokenizer with lower_case true
102 
103   // Create a TextFile dataset
104   std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
105   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
106   EXPECT_NE(ds, nullptr);
107 
108   // Create Skip operation on ds
109   ds = ds->Skip(6);
110   EXPECT_NE(ds, nullptr);
111 
112   // Create BasicTokenizer operation on ds
113   std::shared_ptr<TensorTransform> basic_tokenizer = std::make_shared<text::BasicTokenizer>(true);
114   EXPECT_NE(basic_tokenizer, nullptr);
115 
116   // Create Map operation on ds
117   ds = ds->Map({basic_tokenizer}, {"text"});
118   EXPECT_NE(ds, nullptr);
119 
120   // Create an iterator over the result of the above dataset
121   // This will trigger the creation of the Execution Tree and launch it.
122   std::shared_ptr<Iterator> iter = ds->CreateIterator();
123   EXPECT_NE(iter, nullptr);
124 
125   // Iterate the dataset and get each row
126   std::unordered_map<std::string, mindspore::MSTensor> row;
127   ASSERT_OK(iter->GetNextRow(&row));
128 
129   std::vector<std::string> expected = {"this", "is", "a", "funky", "string"};
130   std::shared_ptr<Tensor> de_expected_tensor;
131   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
132   mindspore::MSTensor expected_tensor =
133     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
134 
135   uint64_t i = 0;
136   while (row.size() != 0) {
137     auto ind = row["text"];
138     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
139     ASSERT_OK(iter->GetNextRow(&row));
140     i++;
141   }
142 
143   EXPECT_EQ(i, 1);
144 
145   // Manually terminate the pipeline
146   iter->Stop();
147 }
148 
149 TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess3) {
150   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess3.";
151   // Test BasicTokenizer with with_offsets true and lower_case true
152 
153   // Create a TextFile dataset
154   std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
155   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
156   EXPECT_NE(ds, nullptr);
157 
158   // Create Skip operation on ds
159   ds = ds->Skip(6);
160   EXPECT_NE(ds, nullptr);
161 
162   // Create BasicTokenizer operation on ds
163   std::shared_ptr<TensorTransform> basic_tokenizer =
164     std::make_shared<text::BasicTokenizer>(true, false, NormalizeForm::kNone, true, true);
165   EXPECT_NE(basic_tokenizer, nullptr);
166 
167   // Create Map operation on ds
168   ds = ds->Map({basic_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
169   EXPECT_NE(ds, nullptr);
170 
171   // Create an iterator over the result of the above dataset
172   // This will trigger the creation of the Execution Tree and launch it.
173   std::shared_ptr<Iterator> iter = ds->CreateIterator();
174   EXPECT_NE(iter, nullptr);
175 
176   // Iterate the dataset and get each row
177   std::unordered_map<std::string, mindspore::MSTensor> row;
178   ASSERT_OK(iter->GetNextRow(&row));
179 
180   std::vector<std::string> expected_tokens = {"this", "is", "a", "funky", "string"};
181   std::vector<uint32_t> expected_offsets_start = {0, 5, 8, 10, 16};
182   std::vector<uint32_t> expected_offsets_limit = {4, 7, 9, 15, 22};
183 
184   std::shared_ptr<Tensor> de_expected_tokens;
185   ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens));
186   mindspore::MSTensor ms_expected_tokens =
187     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
188 
189   std::shared_ptr<Tensor> de_expected_offsets_start;
190   ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start));
191   mindspore::MSTensor ms_expected_offsets_start =
192     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
193 
194   std::shared_ptr<Tensor> de_expected_offsets_limit;
195   ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit));
196   mindspore::MSTensor ms_expected_offsets_limit =
197     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
198 
199   uint64_t i = 0;
200   while (row.size() != 0) {
201     auto ind = row["token"];
202     EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens);
203 
204     auto start = row["offsets_start"];
205     EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
206 
207     auto limit = row["offsets_limit"];
208     EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
209 
210     ASSERT_OK(iter->GetNextRow(&row));
211     i++;
212   }
213 
214   EXPECT_EQ(i, 1);
215 
216   // Manually terminate the pipeline
217   iter->Stop();
218 }
219 
// Shared vocabulary used by the BertTokenizer test cases below.
// NOTE(review): the four emoji entries were mojibake ("\xEF\xBF\xBD" pairs) in
// the checked-in copy; restored to the laughing-face emoji that match the
// expected outputs — confirm against testTokenizerData/bert_tokenizer.txt.
std::vector<std::string> list = {
  "床", "前", "明",    "月",    "光",    "疑",    "是",      "地",        "上",        "霜",   "举",    "头",
  "望", "低", "思",    "故",    "乡",    "繁",    "體",      "字",        "嘿",        "哈",   "大",    "笑",
  "嘻", "i",  "am",    "mak",   "make",  "small", "mistake", "##s",       "during",    "work", "##ing", "hour",
  "😀",  "😃",  "😄",     "😁",     "+",     "/",     "-",       "=",         "12",        "28",   "40",    "16",
  " ",  "I",  "[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]",  "[unused1]", "[unused10]"};
226 
227 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess1) {
228   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess1.";
229   // Test BertTokenizer with default parameters
230 
231   // Create a TextFile dataset
232   std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
233   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
234   EXPECT_NE(ds, nullptr);
235 
236   // Create Take operation on ds
237   ds = ds->Take(4);
238   EXPECT_NE(ds, nullptr);
239 
240   // Create a vocab from vector
241   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
242   Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
243   EXPECT_EQ(s, Status::OK());
244 
245   // Create BertTokenizer operation on ds
246   std::shared_ptr<TensorTransform> bert_tokenizer = std::make_shared<text::BertTokenizer>(vocab);
247   EXPECT_NE(bert_tokenizer, nullptr);
248 
249   // Create Map operation on ds
250   ds = ds->Map({bert_tokenizer}, {"text"});
251   EXPECT_NE(ds, nullptr);
252 
253   // Create an iterator over the result of the above dataset
254   // This will trigger the creation of the Execution Tree and launch it.
255   std::shared_ptr<Iterator> iter = ds->CreateIterator();
256   EXPECT_NE(iter, nullptr);
257 
258   // Iterate the dataset and get each row
259   std::unordered_map<std::string, mindspore::MSTensor> row;
260   ASSERT_OK(iter->GetNextRow(&row));
261 
262   std::vector<std::vector<std::string>> expected = {{"床", "前", "明", "月", "光"},
263                                                     {"疑", "是", "地", "上", "霜"},
264                                                     {"举", "头", "望", "明", "月"},
265                                                     {"低", "头", "思", "故", "乡"}};
266 
267   uint64_t i = 0;
268   while (row.size() != 0) {
269     auto ind = row["text"];
270     std::shared_ptr<Tensor> de_expected_tensor;
271     ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
272     mindspore::MSTensor expected_tensor =
273       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
274     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
275 
276     ASSERT_OK(iter->GetNextRow(&row));
277     i++;
278   }
279 
280   EXPECT_EQ(i, 4);
281 
282   // Manually terminate the pipeline
283   iter->Stop();
284 }
285 
286 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess2) {
287   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess2.";
288   // Test BertTokenizer with lower_case true
289 
290   // Create a TextFile dataset
291   std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
292   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
293   EXPECT_NE(ds, nullptr);
294 
295   // Create Skip operation on ds
296   ds = ds->Skip(4);
297   EXPECT_NE(ds, nullptr);
298 
299   // Create Take operation on ds
300   ds = ds->Take(1);
301   EXPECT_NE(ds, nullptr);
302 
303   // Create a vocab from vector
304   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
305   Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
306   EXPECT_EQ(s, Status::OK());
307 
308   // Create BertTokenizer operation on ds
309   std::shared_ptr<TensorTransform> bert_tokenizer =
310     std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", true);
311   EXPECT_NE(bert_tokenizer, nullptr);
312 
313   // Create Map operation on ds
314   ds = ds->Map({bert_tokenizer}, {"text"});
315   EXPECT_NE(ds, nullptr);
316 
317   // Create an iterator over the result of the above dataset
318   // This will trigger the creation of the Execution Tree and launch it.
319   std::shared_ptr<Iterator> iter = ds->CreateIterator();
320   EXPECT_NE(iter, nullptr);
321 
322   // Iterate the dataset and get each row
323   std::unordered_map<std::string, mindspore::MSTensor> row;
324   ASSERT_OK(iter->GetNextRow(&row));
325 
326   std::vector<std::string> expected = {"i",   "am",     "mak",  "##ing", "small", "mistake",
327                                        "##s", "during", "work", "##ing", "hour",  "##s"};
328   std::shared_ptr<Tensor> de_expected_tensor;
329   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
330   mindspore::MSTensor expected_tensor =
331     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
332 
333   uint64_t i = 0;
334   while (row.size() != 0) {
335     auto ind = row["text"];
336     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
337     ASSERT_OK(iter->GetNextRow(&row));
338     i++;
339   }
340 
341   EXPECT_EQ(i, 1);
342 
343   // Manually terminate the pipeline
344   iter->Stop();
345 }
346 
347 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess3) {
348   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess3.";
349   // Test BertTokenizer with normalization_form NFKC
350 
351   // Create a TextFile dataset
352   std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
353   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
354   EXPECT_NE(ds, nullptr);
355 
356   // Create Skip operation on ds
357   ds = ds->Skip(5);
358   EXPECT_NE(ds, nullptr);
359 
360   // Create Take operation on ds
361   ds = ds->Take(2);
362   EXPECT_NE(ds, nullptr);
363 
364   // Create a vocab from vector
365   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
366   Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
367   EXPECT_EQ(s, Status::OK());
368 
369   // Create BertTokenizer operation on ds
370   std::shared_ptr<TensorTransform> bert_tokenizer =
371     std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", false, false, NormalizeForm::kNfc);
372   EXPECT_NE(bert_tokenizer, nullptr);
373 
374   // Create Map operation on ds
375   ds = ds->Map({bert_tokenizer}, {"text"});
376   EXPECT_NE(ds, nullptr);
377 
378   // Create an iterator over the result of the above dataset
379   // This will trigger the creation of the Execution Tree and launch it.
380   std::shared_ptr<Iterator> iter = ds->CreateIterator();
381   EXPECT_NE(iter, nullptr);
382 
383   // Iterate the dataset and get each row
384   std::unordered_map<std::string, mindspore::MSTensor> row;
385   ASSERT_OK(iter->GetNextRow(&row));
386 
387   std::vector<std::vector<std::string>> expected = {
388     {"��", "嘿", "嘿", "��", "哈", "哈", "��", "大", "笑", "��", "嘻", "嘻"}, {"繁", "體", "字"}};
389 
390   uint64_t i = 0;
391   while (row.size() != 0) {
392     auto ind = row["text"];
393     std::shared_ptr<Tensor> de_expected_tensor;
394     ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
395     mindspore::MSTensor expected_tensor =
396       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
397     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
398 
399     ASSERT_OK(iter->GetNextRow(&row));
400     i++;
401   }
402 
403   EXPECT_EQ(i, 2);
404 
405   // Manually terminate the pipeline
406   iter->Stop();
407 }
408 
409 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess4) {
410   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess4.";
411   // Test BertTokenizer with keep_whitespace true
412 
413   // Create a TextFile dataset
414   std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
415   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
416   EXPECT_NE(ds, nullptr);
417 
418   // Create Skip operation on ds
419   ds = ds->Skip(7);
420   EXPECT_NE(ds, nullptr);
421 
422   // Create Take operation on ds
423   ds = ds->Take(1);
424   EXPECT_NE(ds, nullptr);
425 
426   // Create a vocab from vector
427   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
428   Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
429   EXPECT_EQ(s, Status::OK());
430 
431   // Create BertTokenizer operation on ds
432   std::shared_ptr<TensorTransform> bert_tokenizer =
433     std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", false, true);
434   EXPECT_NE(bert_tokenizer, nullptr);
435 
436   // Create Map operation on ds
437   ds = ds->Map({bert_tokenizer}, {"text"});
438   EXPECT_NE(ds, nullptr);
439 
440   // Create an iterator over the result of the above dataset
441   // This will trigger the creation of the Execution Tree and launch it.
442   std::shared_ptr<Iterator> iter = ds->CreateIterator();
443   EXPECT_NE(iter, nullptr);
444 
445   // Iterate the dataset and get each row
446   std::unordered_map<std::string, mindspore::MSTensor> row;
447   ASSERT_OK(iter->GetNextRow(&row));
448 
449   std::vector<std::string> expected = {"[UNK]", " ", "[CLS]"};
450   std::shared_ptr<Tensor> de_expected_tensor;
451   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
452   mindspore::MSTensor expected_tensor =
453     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
454 
455   uint64_t i = 0;
456   while (row.size() != 0) {
457     auto ind = row["text"];
458     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
459     ASSERT_OK(iter->GetNextRow(&row));
460     i++;
461   }
462 
463   EXPECT_EQ(i, 1);
464 
465   // Manually terminate the pipeline
466   iter->Stop();
467 }
468 
469 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess5) {
470   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess5.";
471   // Test BertTokenizer with unknown_token empty and keep_whitespace true
472 
473   // Create a TextFile dataset
474   std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
475   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
476   EXPECT_NE(ds, nullptr);
477 
478   // Create Skip operation on ds
479   ds = ds->Skip(7);
480   EXPECT_NE(ds, nullptr);
481 
482   // Create Take operation on ds
483   ds = ds->Take(1);
484   EXPECT_NE(ds, nullptr);
485 
486   // Create a vocab from vector
487   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
488   Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
489   EXPECT_EQ(s, Status::OK());
490 
491   // Create BertTokenizer operation on ds
492   std::shared_ptr<TensorTransform> bert_tokenizer =
493     std::make_shared<text::BertTokenizer>(vocab, "##", 100, "", false, true);
494   EXPECT_NE(bert_tokenizer, nullptr);
495 
496   // Create Map operation on ds
497   ds = ds->Map({bert_tokenizer}, {"text"});
498   EXPECT_NE(ds, nullptr);
499 
500   // Create an iterator over the result of the above dataset
501   // This will trigger the creation of the Execution Tree and launch it.
502   std::shared_ptr<Iterator> iter = ds->CreateIterator();
503   EXPECT_NE(iter, nullptr);
504 
505   // Iterate the dataset and get each row
506   std::unordered_map<std::string, mindspore::MSTensor> row;
507   ASSERT_OK(iter->GetNextRow(&row));
508 
509   std::vector<std::string> expected = {"unused", " ", "[CLS]"};
510   std::shared_ptr<Tensor> de_expected_tensor;
511   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
512   mindspore::MSTensor expected_tensor =
513     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
514 
515   uint64_t i = 0;
516   while (row.size() != 0) {
517     auto ind = row["text"];
518     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
519     ASSERT_OK(iter->GetNextRow(&row));
520     i++;
521   }
522 
523   EXPECT_EQ(i, 1);
524 
525   // Manually terminate the pipeline
526   iter->Stop();
527 }
528 
529 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess6) {
530   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess6.";
531   // Test BertTokenizer with preserve_unused_token false, unknown_token empty and keep_whitespace true
532 
533   // Create a TextFile dataset
534   std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
535   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
536   EXPECT_NE(ds, nullptr);
537 
538   // Create Skip operation on ds
539   ds = ds->Skip(7);
540   EXPECT_NE(ds, nullptr);
541 
542   // Create Take operation on ds
543   ds = ds->Take(1);
544   EXPECT_NE(ds, nullptr);
545 
546   // Create a vocab from vector
547   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
548   Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
549   EXPECT_EQ(s, Status::OK());
550 
551   // Create BertTokenizer operation on ds
552   std::shared_ptr<TensorTransform> bert_tokenizer =
553     std::make_shared<text::BertTokenizer>(vocab, "##", 100, "", false, true, NormalizeForm::kNone, false);
554   EXPECT_NE(bert_tokenizer, nullptr);
555 
556   // Create Map operation on ds
557   ds = ds->Map({bert_tokenizer}, {"text"});
558   EXPECT_NE(ds, nullptr);
559 
560   // Create an iterator over the result of the above dataset
561   // This will trigger the creation of the Execution Tree and launch it.
562   std::shared_ptr<Iterator> iter = ds->CreateIterator();
563   EXPECT_NE(iter, nullptr);
564 
565   // Iterate the dataset and get each row
566   std::unordered_map<std::string, mindspore::MSTensor> row;
567   ASSERT_OK(iter->GetNextRow(&row));
568 
569   std::vector<std::string> expected = {"unused", " ", "[", "CLS", "]"};
570   std::shared_ptr<Tensor> de_expected_tensor;
571   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
572   mindspore::MSTensor expected_tensor =
573     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
574 
575   uint64_t i = 0;
576   while (row.size() != 0) {
577     auto ind = row["text"];
578     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
579     ASSERT_OK(iter->GetNextRow(&row));
580     i++;
581   }
582 
583   EXPECT_EQ(i, 1);
584 
585   // Manually terminate the pipeline
586   iter->Stop();
587 }
588 
589 TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess7) {
590   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess7.";
591   // Test BertTokenizer with with_offsets true and lower_case true
592 
593   // Create a TextFile dataset
594   std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
595   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
596   EXPECT_NE(ds, nullptr);
597 
598   // Create Skip operation on ds
599   ds = ds->Skip(4);
600   EXPECT_NE(ds, nullptr);
601 
602   // Create Take operation on ds
603   ds = ds->Take(1);
604   EXPECT_NE(ds, nullptr);
605 
606   // Create a vocab from vector
607   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
608   Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
609   EXPECT_EQ(s, Status::OK());
610 
611   // Create BertTokenizer operation on ds
612   std::shared_ptr<TensorTransform> bert_tokenizer =
613     std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", true, false, NormalizeForm::kNone, true, true);
614   EXPECT_NE(bert_tokenizer, nullptr);
615 
616   // Create Map operation on ds
617   ds = ds->Map({bert_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
618   EXPECT_NE(ds, nullptr);
619 
620   // Create an iterator over the result of the above dataset
621   // This will trigger the creation of the Execution Tree and launch it.
622   std::shared_ptr<Iterator> iter = ds->CreateIterator();
623   EXPECT_NE(iter, nullptr);
624 
625   // Iterate the dataset and get each row
626   std::unordered_map<std::string, mindspore::MSTensor> row;
627   ASSERT_OK(iter->GetNextRow(&row));
628 
629   std::vector<std::string> expected_tokens = {"i",   "am",     "mak",  "##ing", "small", "mistake",
630                                               "##s", "during", "work", "##ing", "hour",  "##s"};
631   std::vector<uint32_t> expected_offsets_start = {0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46};
632   std::vector<uint32_t> expected_offsets_limit = {1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47};
633 
634   std::shared_ptr<Tensor> de_expected_tokens;
635   ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens));
636   mindspore::MSTensor ms_expected_tokens =
637     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
638 
639   std::shared_ptr<Tensor> de_expected_offsets_start;
640   ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start));
641   mindspore::MSTensor ms_expected_offsets_start =
642     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
643 
644   std::shared_ptr<Tensor> de_expected_offsets_limit;
645   ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit));
646   mindspore::MSTensor ms_expected_offsets_limit =
647     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
648 
649   uint64_t i = 0;
650   while (row.size() != 0) {
651     auto ind = row["token"];
652     EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens);
653 
654     auto start = row["offsets_start"];
655     EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
656 
657     auto limit = row["offsets_limit"];
658     EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
659 
660     ASSERT_OK(iter->GetNextRow(&row));
661     i++;
662   }
663 
664   EXPECT_EQ(i, 1);
665 
666   // Manually terminate the pipeline
667   iter->Stop();
668 }
669 
670 TEST_F(MindDataTestPipeline, TestBertTokenizerFail1) {
671   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail1.";
672   // Test BertTokenizer with nullptr vocab
673 
674   // Create a TextFile dataset
675   std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
676   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
677   EXPECT_NE(ds, nullptr);
678 
679   // Create BertTokenizer operation on ds
680   std::shared_ptr<TensorTransform> bert_tokenizer = std::make_shared<text::BertTokenizer>(nullptr);
681   EXPECT_NE(bert_tokenizer, nullptr);
682 
683   // Create a Map operation on ds
684   ds = ds->Map({bert_tokenizer});
685   EXPECT_NE(ds, nullptr);
686 
687   std::shared_ptr<Iterator> iter = ds->CreateIterator();
688   // Expect failure: invalid BertTokenizer input with nullptr vocab
689   EXPECT_EQ(iter, nullptr);
690 }
691 
TEST_F(MindDataTestPipeline, TestBertTokenizerFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail2.";
  // Test BertTokenizer with negative max_bytes_per_token

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create BertTokenizer operation on ds with an invalid negative max_bytes_per_token
  std::shared_ptr<TensorTransform> bert_tokenizer = std::make_shared<text::BertTokenizer>(vocab, "##", -1);
  EXPECT_NE(bert_tokenizer, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({bert_tokenizer});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid BertTokenizer input with negative max_bytes_per_token
  EXPECT_EQ(iter, nullptr);
}
718 
719 TEST_F(MindDataTestPipeline, TestCaseFoldSuccess) {
720   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaseFoldSuccess.";
721 
722   // Create a TextFile dataset
723   std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
724   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
725   EXPECT_NE(ds, nullptr);
726 
727   // Create casefold operation on ds
728   std::shared_ptr<TensorTransform> casefold = std::make_shared<text::CaseFold>();
729   EXPECT_NE(casefold, nullptr);
730 
731   // Create Map operation on ds
732   ds = ds->Map({casefold}, {"text"});
733   EXPECT_NE(ds, nullptr);
734 
735   // Create an iterator over the result of the above dataset
736   // This will trigger the creation of the Execution Tree and launch it.
737   std::shared_ptr<Iterator> iter = ds->CreateIterator();
738   EXPECT_NE(iter, nullptr);
739 
740   // Iterate the dataset and get each row
741   std::unordered_map<std::string, mindspore::MSTensor> row;
742   ASSERT_OK(iter->GetNextRow(&row));
743 
744   std::vector<std::string> expected = {"welcome to beijing!", "北京欢迎您!", "我喜欢english!", "  "};
745 
746   uint64_t i = 0;
747   while (row.size() != 0) {
748     auto ind = row["text"];
749     std::shared_ptr<Tensor> de_expected_tensor;
750     ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
751     mindspore::MSTensor ms_expected_tensor =
752       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
753     EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
754     ASSERT_OK(iter->GetNextRow(&row));
755     i++;
756   }
757 
758   EXPECT_EQ(i, 4);
759 
760   // Manually terminate the pipeline
761   iter->Stop();
762 }
763 
764 TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess) {
765   // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is false.
766   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess.";
767 
768   // Create a TextFile dataset
769   std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
770   std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
771   std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
772   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
773   EXPECT_NE(ds, nullptr);
774 
775   // Create jieba_tokenizer operation on ds
776   std::shared_ptr<TensorTransform> jieba_tokenizer =
777     std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
778   EXPECT_NE(jieba_tokenizer, nullptr);
779 
780   // Create Map operation on ds
781   ds = ds->Map({jieba_tokenizer}, {"text"});
782   EXPECT_NE(ds, nullptr);
783 
784   // Create an iterator over the result of the above dataset
785   // This will trigger the creation of the Execution Tree and launch it.
786   std::shared_ptr<Iterator> iter = ds->CreateIterator();
787   EXPECT_NE(iter, nullptr);
788 
789   // Iterate the dataset and get each row
790   std::unordered_map<std::string, mindspore::MSTensor> row;
791   ASSERT_OK(iter->GetNextRow(&row));
792 
793   std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
794   std::shared_ptr<Tensor> de_expected_tensor;
795   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
796   mindspore::MSTensor expected_tensor =
797     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
798 
799   uint64_t i = 0;
800   while (row.size() != 0) {
801     auto ind = row["text"];
802     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
803     ASSERT_OK(iter->GetNextRow(&row));
804     i++;
805   }
806 
807   EXPECT_EQ(i, 1);
808 
809   // Manually terminate the pipeline
810   iter->Stop();
811 }
812 
813 TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess1) {
814   // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kHmm and the with_offsets is false.
815   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess1.";
816 
817   // Create a TextFile dataset
818   std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
819   std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
820   std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
821   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
822   EXPECT_NE(ds, nullptr);
823 
824   // Create jieba_tokenizer operation on ds
825   std::shared_ptr<TensorTransform> jieba_tokenizer =
826     std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kHmm);
827   EXPECT_NE(jieba_tokenizer, nullptr);
828 
829   // Create Map operation on ds
830   ds = ds->Map({jieba_tokenizer}, {"text"});
831   EXPECT_NE(ds, nullptr);
832 
833   // Create an iterator over the result of the above dataset
834   // This will trigger the creation of the Execution Tree and launch it.
835   std::shared_ptr<Iterator> iter = ds->CreateIterator();
836   EXPECT_NE(iter, nullptr);
837 
838   // Iterate the dataset and get each row
839   std::unordered_map<std::string, mindspore::MSTensor> row;
840   ASSERT_OK(iter->GetNextRow(&row));
841 
842   std::vector<std::string> expected = {"今天", "天气", "太", "好", "了", "我们", "一起", "去", "外面", "玩", "吧"};
843   std::shared_ptr<Tensor> de_expected_tensor;
844   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
845   mindspore::MSTensor expected_tensor =
846     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
847 
848   uint64_t i = 0;
849   while (row.size() != 0) {
850     auto ind = row["text"];
851     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
852     ASSERT_OK(iter->GetNextRow(&row));
853     i++;
854   }
855 
856   EXPECT_EQ(i, 1);
857 
858   // Manually terminate the pipeline
859   iter->Stop();
860 }
861 
862 TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess2) {
863   // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is true.
864   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess2.";
865 
866   // Create a TextFile dataset
867   std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
868   std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
869   std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
870   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
871   EXPECT_NE(ds, nullptr);
872 
873   // Create jieba_tokenizer operation on ds
874   std::shared_ptr<TensorTransform> jieba_tokenizer =
875     std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp, true);
876   EXPECT_NE(jieba_tokenizer, nullptr);
877 
878   // Create Map operation on ds
879   ds = ds->Map({jieba_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
880                {"token", "offsets_start", "offsets_limit"});
881   EXPECT_NE(ds, nullptr);
882 
883   // Create an iterator over the result of the above dataset
884   // This will trigger the creation of the Execution Tree and launch it.
885   std::shared_ptr<Iterator> iter = ds->CreateIterator();
886   EXPECT_NE(iter, nullptr);
887 
888   // Iterate the dataset and get each row
889   std::unordered_map<std::string, mindspore::MSTensor> row;
890   ASSERT_OK(iter->GetNextRow(&row));
891 
892   std::vector<std::string> expected_tokens = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
893   std::vector<uint32_t> expected_offsets_start = {0, 12, 21, 27, 33, 36, 42};
894   std::vector<uint32_t> expected_offsets_limit = {12, 21, 27, 33, 36, 42, 48};
895 
896   std::shared_ptr<Tensor> de_expected_tokens;
897   ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens));
898   mindspore::MSTensor ms_expected_tokens =
899     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
900 
901   std::shared_ptr<Tensor> de_expected_offsets_start;
902   ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start));
903   mindspore::MSTensor ms_expected_offsets_start =
904     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
905 
906   std::shared_ptr<Tensor> de_expected_offsets_limit;
907   ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit));
908   mindspore::MSTensor ms_expected_offsets_limit =
909     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
910 
911   uint64_t i = 0;
912   while (row.size() != 0) {
913     auto ind = row["token"];
914     EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens);
915 
916     auto start = row["offsets_start"];
917     EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
918 
919     auto limit = row["offsets_limit"];
920     EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
921 
922     ASSERT_OK(iter->GetNextRow(&row));
923     i++;
924   }
925 
926   EXPECT_EQ(i, 1);
927 
928   // Manually terminate the pipeline
929   iter->Stop();
930 }
931 
932 TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail1) {
933   // Testing the incorrect parameter of JiebaTokenizer interface.
934   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail1.";
935 
936   // Create a TextFile dataset
937   std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
938   std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
939   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
940   EXPECT_NE(ds, nullptr);
941 
942   // Create jieba_tokenizer operation on ds
943   // Testing the parameter hmm_path is empty
944   std::shared_ptr<TensorTransform> jieba_tokenizer =
945     std::make_shared<text::JiebaTokenizer>("", mp_path, JiebaMode::kMp);
946   EXPECT_NE(jieba_tokenizer, nullptr);
947 
948   // Create a Map operation on ds
949   ds = ds->Map({jieba_tokenizer});
950   EXPECT_NE(ds, nullptr);
951 
952   std::shared_ptr<Iterator> iter = ds->CreateIterator();
953   // Expect failure: invalid JiebaTokenizer input (parameter hmm_path is empty)
954   EXPECT_EQ(iter, nullptr);
955 }
956 
957 TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail2) {
958   // Testing the incorrect parameter of JiebaTokenizer interface.
959   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail2.";
960 
961   // Create a TextFile dataset
962   std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
963   std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
964   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
965   EXPECT_NE(ds, nullptr);
966 
967   // Create jieba_tokenizer operation on ds
968   // Testing the parameter mp_path is empty
969   std::shared_ptr<TensorTransform> jieba_tokenizer =
970     std::make_shared<text::JiebaTokenizer>(hmm_path, "", JiebaMode::kMp);
971   EXPECT_NE(jieba_tokenizer, nullptr);
972 
973   // Create a Map operation on ds
974   ds = ds->Map({jieba_tokenizer});
975   EXPECT_NE(ds, nullptr);
976 
977   std::shared_ptr<Iterator> iter = ds->CreateIterator();
978   // Expect failure: invalid JiebaTokenizer input (parameter mp_path is empty)
979   EXPECT_EQ(iter, nullptr);
980 }
981 
982 TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail3) {
983   // Testing the incorrect parameter of JiebaTokenizer interface.
984   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail3.";
985 
986   // Create a TextFile dataset
987   std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
988   std::string hmm_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
989   std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
990   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
991   EXPECT_NE(ds, nullptr);
992 
993   // Create jieba_tokenizer operation on ds
994   // Testing the parameter hmm_path is invalid path
995   std::shared_ptr<TensorTransform> jieba_tokenizer =
996     std::make_shared<text::JiebaTokenizer>(hmm_path_invalid, mp_path, JiebaMode::kMp);
997   EXPECT_NE(jieba_tokenizer, nullptr);
998 
999   // Create a Map operation on ds
1000   ds = ds->Map({jieba_tokenizer});
1001   EXPECT_NE(ds, nullptr);
1002 
1003   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1004   // Expect failure: invalid JiebaTokenizer input (parameter hmm_path is invalid path)
1005   EXPECT_EQ(iter, nullptr);
1006 }
1007 
1008 TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail4) {
1009   // Testing the incorrect parameter of JiebaTokenizer interface.
1010   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail4.";
1011 
1012   // Create a TextFile dataset
1013   std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
1014   std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1015   std::string mp_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
1016   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1017   EXPECT_NE(ds, nullptr);
1018 
1019   // Create jieba_tokenizer operation on ds
1020   // Testing the parameter mp_path is invalid path
1021   std::shared_ptr<TensorTransform> jieba_tokenizer =
1022     std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path_invalid, JiebaMode::kMp);
1023   EXPECT_NE(jieba_tokenizer, nullptr);
1024 
1025   // Create a Map operation on ds
1026   ds = ds->Map({jieba_tokenizer});
1027   EXPECT_NE(ds, nullptr);
1028 
1029   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1030   // Expect failure: invalid JiebaTokenizer input (parameter mp_path is invalid path)
1031   EXPECT_EQ(iter, nullptr);
1032 }
1033 
1034 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) {
1035   // Testing the parameter AddWord of JiebaTokenizer when the freq is not provided (default 0).
1036   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord.";
1037 
1038   // Create a TextFile dataset
1039   std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
1040   std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1041   std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1042   std::shared_ptr<Dataset> ds = TextFile({data_file});
1043   EXPECT_NE(ds, nullptr);
1044 
1045   // Create jieba_tokenizer operation on ds
1046   std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
1047     std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1048   EXPECT_NE(jieba_tokenizer, nullptr);
1049 
1050   // Add word with freq not provided (default 0)
1051   ASSERT_OK(jieba_tokenizer->AddWord("男默女泪"));
1052 
1053   // Create Map operation on ds
1054   ds = ds->Map({jieba_tokenizer}, {"text"});
1055   EXPECT_NE(ds, nullptr);
1056 
1057   // Create an iterator over the result of the above dataset
1058   // This will trigger the creation of the Execution Tree and launch it.
1059   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1060   EXPECT_NE(iter, nullptr);
1061 
1062   // Iterate the dataset and get each row
1063   std::unordered_map<std::string, mindspore::MSTensor> row;
1064   ASSERT_OK(iter->GetNextRow(&row));
1065 
1066   std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
1067   std::shared_ptr<Tensor> de_expected_tensor;
1068   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
1069   mindspore::MSTensor expected_tensor =
1070     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1071 
1072   uint64_t i = 0;
1073   while (row.size() != 0) {
1074     auto ind = row["text"];
1075     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
1076     ASSERT_OK(iter->GetNextRow(&row));
1077     i++;
1078   }
1079 
1080   EXPECT_EQ(i, 1);
1081 
1082   // Manually terminate the pipeline
1083   iter->Stop();
1084 }
1085 
1086 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) {
1087   // Testing the parameter AddWord of JiebaTokenizer when the freq is set explicitly to 0.
1088   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord1.";
1089 
1090   // Create a TextFile dataset
1091   std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
1092   std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1093   std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1094   std::shared_ptr<Dataset> ds = TextFile({data_file});
1095   EXPECT_NE(ds, nullptr);
1096 
1097   // Create jieba_tokenizer operation on ds
1098   std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
1099     std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1100   EXPECT_NE(jieba_tokenizer, nullptr);
1101 
1102   // Add word with freq is set explicitly to 0
1103   ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 0));
1104 
1105   // Create Map operation on ds
1106   ds = ds->Map({jieba_tokenizer}, {"text"});
1107   EXPECT_NE(ds, nullptr);
1108 
1109   // Create an iterator over the result of the above dataset
1110   // This will trigger the creation of the Execution Tree and launch it.
1111   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1112   EXPECT_NE(iter, nullptr);
1113 
1114   // Iterate the dataset and get each row
1115   std::unordered_map<std::string, mindspore::MSTensor> row;
1116   ASSERT_OK(iter->GetNextRow(&row));
1117 
1118   std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
1119   std::shared_ptr<Tensor> de_expected_tensor;
1120   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
1121   mindspore::MSTensor expected_tensor =
1122     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1123 
1124   uint64_t i = 0;
1125   while (row.size() != 0) {
1126     auto ind = row["text"];
1127     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
1128     ASSERT_OK(iter->GetNextRow(&row));
1129     i++;
1130   }
1131 
1132   EXPECT_EQ(i, 1);
1133 
1134   // Manually terminate the pipeline
1135   iter->Stop();
1136 }
1137 
1138 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) {
1139   // Testing the parameter AddWord of JiebaTokenizer when the freq is 10.
1140   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord2.";
1141 
1142   // Create a TextFile dataset
1143   std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
1144   std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1145   std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1146   std::shared_ptr<Dataset> ds = TextFile({data_file});
1147   EXPECT_NE(ds, nullptr);
1148 
1149   // Create jieba_tokenizer operation on ds
1150   std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
1151     std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1152   EXPECT_NE(jieba_tokenizer, nullptr);
1153 
1154   // Add word with freq 10
1155   ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 10));
1156 
1157   // Create Map operation on ds
1158   ds = ds->Map({jieba_tokenizer}, {"text"});
1159   EXPECT_NE(ds, nullptr);
1160 
1161   // Create an iterator over the result of the above dataset
1162   // This will trigger the creation of the Execution Tree and launch it.
1163   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1164   EXPECT_NE(iter, nullptr);
1165 
1166   // Iterate the dataset and get each row
1167   std::unordered_map<std::string, mindspore::MSTensor> row;
1168   ASSERT_OK(iter->GetNextRow(&row));
1169 
1170   std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
1171   std::shared_ptr<Tensor> de_expected_tensor;
1172   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
1173   mindspore::MSTensor expected_tensor =
1174     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1175 
1176   uint64_t i = 0;
1177   while (row.size() != 0) {
1178     auto ind = row["text"];
1179     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
1180     ASSERT_OK(iter->GetNextRow(&row));
1181     i++;
1182   }
1183 
1184   EXPECT_EQ(i, 1);
1185 
1186   // Manually terminate the pipeline
1187   iter->Stop();
1188 }
1189 
1190 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) {
1191   // Testing the parameter AddWord of JiebaTokenizer when the freq is 20000 which affects the result of segmentation.
1192   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord3.";
1193 
1194   // Create a TextFile dataset
1195   std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
1196   std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1197   std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1198   std::shared_ptr<Dataset> ds = TextFile({data_file});
1199   EXPECT_NE(ds, nullptr);
1200 
1201   // Create jieba_tokenizer operation on ds
1202   std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
1203     std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1204   EXPECT_NE(jieba_tokenizer, nullptr);
1205 
1206   // Add word with freq 20000
1207   ASSERT_OK(jieba_tokenizer->AddWord("江大桥", 20000));
1208 
1209   // Create Map operation on ds
1210   ds = ds->Map({jieba_tokenizer}, {"text"});
1211   EXPECT_NE(ds, nullptr);
1212 
1213   // Create an iterator over the result of the above dataset
1214   // This will trigger the creation of the Execution Tree and launch it.
1215   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1216   EXPECT_NE(iter, nullptr);
1217 
1218   // Iterate the dataset and get each row
1219   std::unordered_map<std::string, mindspore::MSTensor> row;
1220   ASSERT_OK(iter->GetNextRow(&row));
1221 
1222   std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"};
1223   std::shared_ptr<Tensor> de_expected_tensor;
1224   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
1225   mindspore::MSTensor expected_tensor =
1226     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1227 
1228   uint64_t i = 0;
1229   while (row.size() != 0) {
1230     auto ind = row["text"];
1231     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
1232     ASSERT_OK(iter->GetNextRow(&row));
1233     i++;
1234   }
1235 
1236   EXPECT_EQ(i, 1);
1237 
1238   // Manually terminate the pipeline
1239   iter->Stop();
1240 }
1241 
1242 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) {
1243   // Testing the incorrect parameter of AddWord in JiebaTokenizer.
1244   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWordFail.";
1245 
1246   // Create a TextFile dataset
1247   std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
1248   std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1249   std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1250   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1251   EXPECT_NE(ds, nullptr);
1252 
1253   // Testing the parameter word of AddWord is empty
1254   std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
1255     std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1256   EXPECT_NE(jieba_tokenizer, nullptr);
1257   EXPECT_NE(jieba_tokenizer->AddWord("", 10), Status::OK());
1258   // Testing the parameter freq of AddWord is negative
1259   std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer1 =
1260     std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1261   EXPECT_NE(jieba_tokenizer1, nullptr);
1262   EXPECT_NE(jieba_tokenizer1->AddWord("我们", -1), Status::OK());
1263 }
1264 
1265 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDict) {
1266   // Testing AddDict of JiebaTokenizer when the input is a vector of word-freq pair.
1267   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDict.";
1268 
1269   // Create a TextFile dataset
1270   std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
1271   std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1272   std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1273   std::shared_ptr<Dataset> ds = TextFile({data_file});
1274   EXPECT_NE(ds, nullptr);
1275 
1276   // Create jieba_tokenizer operation on ds
1277   std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
1278     std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1279   EXPECT_NE(jieba_tokenizer, nullptr);
1280 
1281   // Add word with freq 20000
1282   std::vector<std::pair<std::string, int64_t>> user_dict = {{"江大桥", 20000}};
1283   ASSERT_OK(jieba_tokenizer->AddDict(user_dict));
1284 
1285   // Create Map operation on ds
1286   ds = ds->Map({jieba_tokenizer}, {"text"});
1287   EXPECT_NE(ds, nullptr);
1288 
1289   // Create an iterator over the result of the above dataset
1290   // This will trigger the creation of the Execution Tree and launch it.
1291   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1292   EXPECT_NE(iter, nullptr);
1293 
1294   // Iterate the dataset and get each row
1295   std::unordered_map<std::string, mindspore::MSTensor> row;
1296   ASSERT_OK(iter->GetNextRow(&row));
1297 
1298   std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"};
1299   std::shared_ptr<Tensor> de_expected_tensor;
1300   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
1301   mindspore::MSTensor expected_tensor =
1302     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1303 
1304   uint64_t i = 0;
1305   while (row.size() != 0) {
1306     auto txt = row["text"];
1307     EXPECT_MSTENSOR_EQ(txt, expected_tensor);
1308     ASSERT_OK(iter->GetNextRow(&row));
1309     i++;
1310   }
1311 
1312   EXPECT_EQ(i, 1);
1313 
1314   // Manually terminate the pipeline
1315   iter->Stop();
1316 }
1317 
1318 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDictFromFile) {
1319   // Testing AddDict of JiebaTokenizer when the input is a path to dict.
1320   // Test error scenario for AddDict: invalid path
1321   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDictFromFile.";
1322 
1323   // Create a TextFile dataset
1324   std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
1325   std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1326   std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1327   std::shared_ptr<Dataset> ds = TextFile({data_file});
1328   EXPECT_NE(ds, nullptr);
1329 
1330   // Create jieba_tokenizer operation on ds
1331   std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
1332     std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1333   EXPECT_NE(jieba_tokenizer, nullptr);
1334 
1335   // Load dict from txt file
1336   std::string user_dict_path = datasets_root_path_ + "/testJiebaDataset/user_dict.txt";
1337   std::string invalid_path = datasets_root_path_ + "/testJiebaDataset/invalid_path.txt";
1338   EXPECT_ERROR(jieba_tokenizer->AddDict(invalid_path));
1339   ASSERT_OK(jieba_tokenizer->AddDict(user_dict_path));
1340 
1341   // Create Map operation on ds
1342   ds = ds->Map({jieba_tokenizer}, {"text"});
1343   EXPECT_NE(ds, nullptr);
1344 
1345   // Create an iterator over the result of the above dataset
1346   // This will trigger the creation of the Execution Tree and launch it.
1347   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1348   EXPECT_NE(iter, nullptr);
1349 
1350   // Iterate the dataset and get each row
1351   std::unordered_map<std::string, mindspore::MSTensor> row;
1352   ASSERT_OK(iter->GetNextRow(&row));
1353 
1354   std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
1355   std::shared_ptr<Tensor> de_expected_tensor;
1356   ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
1357   mindspore::MSTensor expected_tensor =
1358     mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1359 
1360   uint64_t i = 0;
1361   while (row.size() != 0) {
1362     auto txt = row["text"];
1363     EXPECT_MSTENSOR_EQ(txt, expected_tensor);
1364     ASSERT_OK(iter->GetNextRow(&row));
1365     i++;
1366   }
1367 
1368   EXPECT_EQ(i, 1);
1369 
1370   // Manually terminate the pipeline
1371   iter->Stop();
1372 }
1373 
1374 TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) {
1375   // Testing the parameter of SlidingWindow interface when the axis is 0.
1376   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess.";
1377 
1378   // Create a TextFile dataset
1379   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
1380   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1381   EXPECT_NE(ds, nullptr);
1382 
1383   // Create white_tokenizer operation on ds
1384   std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
1385   EXPECT_NE(white_tokenizer, nullptr);
1386   // Create sliding_window operation on ds
1387   std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(3, 0);
1388   EXPECT_NE(sliding_window, nullptr);
1389 
1390   // Create Map operation on ds
1391   ds = ds->Map({white_tokenizer, sliding_window}, {"text"});
1392   EXPECT_NE(ds, nullptr);
1393 
1394   // Create an iterator over the result of the above dataset
1395   // This will trigger the creation of the Execution Tree and launch it.
1396   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1397   EXPECT_NE(iter, nullptr);
1398 
1399   // Iterate the dataset and get each row
1400   std::unordered_map<std::string, mindspore::MSTensor> row;
1401   ASSERT_OK(iter->GetNextRow(&row));
1402 
1403   std::vector<std::vector<std::string>> expected = {{"This", "is", "a", "is", "a", "text", "a", "text", "file."},
1404                                                     {"Be", "happy", "every", "happy", "every", "day."},
1405                                                     {"Good", "luck", "to", "luck", "to", "everyone."}};
1406 
1407   uint64_t i = 0;
1408   while (row.size() != 0) {
1409     auto ind = row["text"];
1410 
1411     std::shared_ptr<Tensor> de_expected_tensor;
1412     int x = expected[i].size() / 3;
1413     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x, 3}), &de_expected_tensor));
1414     mindspore::MSTensor expected_tensor =
1415       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1416     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
1417 
1418     ASSERT_OK(iter->GetNextRow(&row));
1419     i++;
1420   }
1421 
1422   EXPECT_EQ(i, 3);
1423 
1424   // Manually terminate the pipeline
1425   iter->Stop();
1426 }
1427 
1428 TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess1) {
1429   // Testing the parameter of SlidingWindow interface when the axis is -1.
1430   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess1.";
1431 
1432   // Create a TextFile dataset
1433   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
1434   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1435   EXPECT_NE(ds, nullptr);
1436 
1437   // Create white_tokenizer operation on ds
1438   std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
1439   EXPECT_NE(white_tokenizer, nullptr);
1440   // Create sliding_window operation on ds
1441   std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(2, -1);
1442   EXPECT_NE(sliding_window, nullptr);
1443 
1444   // Create Map operation on ds
1445   ds = ds->Map({white_tokenizer, sliding_window}, {"text"});
1446   EXPECT_NE(ds, nullptr);
1447 
1448   // Create an iterator over the result of the above dataset
1449   // This will trigger the creation of the Execution Tree and launch it.
1450   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1451   EXPECT_NE(iter, nullptr);
1452 
1453   // Iterate the dataset and get each row
1454   std::unordered_map<std::string, mindspore::MSTensor> row;
1455   ASSERT_OK(iter->GetNextRow(&row));
1456 
1457   std::vector<std::vector<std::string>> expected = {{"This", "is", "is", "a", "a", "text", "text", "file."},
1458                                                     {"Be", "happy", "happy", "every", "every", "day."},
1459                                                     {"Good", "luck", "luck", "to", "to", "everyone."}};
1460   uint64_t i = 0;
1461   while (row.size() != 0) {
1462     auto ind = row["text"];
1463 
1464     std::shared_ptr<Tensor> de_expected_tensor;
1465     int x = expected[i].size() / 2;
1466     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x, 2}), &de_expected_tensor));
1467     mindspore::MSTensor expected_tensor =
1468       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1469     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
1470 
1471     ASSERT_OK(iter->GetNextRow(&row));
1472     i++;
1473   }
1474 
1475   EXPECT_EQ(i, 3);
1476 
1477   // Manually terminate the pipeline
1478   iter->Stop();
1479 }
1480 
1481 TEST_F(MindDataTestPipeline, TestSlidingWindowFail1) {
1482   // Testing the incorrect parameter of SlidingWindow interface.
1483   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail1.";
1484 
1485   // Create a TextFile dataset
1486   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
1487   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1488   EXPECT_NE(ds, nullptr);
1489 
1490   // Create sliding_window operation on ds
1491   // Testing the parameter width less than or equal to 0
1492   // The parameter axis support 0 or -1 only for now
1493   std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(0, 0);
1494   EXPECT_NE(sliding_window, nullptr);
1495 
1496   // Create a Map operation on ds
1497   ds = ds->Map({sliding_window});
1498   EXPECT_NE(ds, nullptr);
1499 
1500   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1501   // Expect failure: invalid SlidingWindow input (width less than or equal to 0)
1502   EXPECT_EQ(iter, nullptr);
1503 }
1504 
1505 TEST_F(MindDataTestPipeline, TestSlidingWindowFail2) {
1506   // Testing the incorrect parameter of SlidingWindow interface.
1507   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail2.";
1508 
1509   // Create a TextFile dataset
1510   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
1511   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1512   EXPECT_NE(ds, nullptr);
1513 
1514   // Create sliding_window operation on ds
1515   // Testing the parameter width less than or equal to 0
1516   // The parameter axis support 0 or -1 only for now
1517   std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(-2, 0);
1518   EXPECT_NE(sliding_window, nullptr);
1519 
1520   // Create a Map operation on ds
1521   ds = ds->Map({sliding_window});
1522   EXPECT_NE(ds, nullptr);
1523 
1524   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1525   // Expect failure: invalid SlidingWindow input (width less than or equal to 0)
1526   EXPECT_EQ(iter, nullptr);
1527 }
1528 
1529 TEST_F(MindDataTestPipeline, TestToNumberSuccess1) {
1530   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberSuccess1.";
1531   // Test ToNumber with integer numbers
1532 
1533   std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1534 
1535   // Create a TextFile dataset
1536   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1537   EXPECT_NE(ds, nullptr);
1538 
1539   // Create a Take operation on ds
1540   ds = ds->Take(8);
1541   EXPECT_NE(ds, nullptr);
1542 
1543   // Create ToNumber operation on ds
1544   std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
1545   EXPECT_NE(to_number, nullptr);
1546 
1547   // Create a Map operation on ds
1548   ds = ds->Map({to_number}, {"text"});
1549   EXPECT_NE(ds, nullptr);
1550 
1551   // Create an iterator over the result of the above dataset
1552   // This will trigger the creation of the Execution Tree and launch it.
1553   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1554   EXPECT_NE(iter, nullptr);
1555 
1556   // Iterate the dataset and get each row
1557   std::unordered_map<std::string, mindspore::MSTensor> row;
1558   ASSERT_OK(iter->GetNextRow(&row));
1559 
1560   std::vector<int64_t> expected = {-121, 14, -2219, 7623, -8162536, 162371864, -1726483716, 98921728421};
1561 
1562   uint64_t i = 0;
1563   while (row.size() != 0) {
1564     auto ind = row["text"];
1565     std::shared_ptr<Tensor> de_expected_tensor;
1566     ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
1567     mindspore::MSTensor ms_expected_tensor =
1568       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1569     EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
1570     ASSERT_OK(iter->GetNextRow(&row));
1571     i++;
1572   }
1573 
1574   EXPECT_EQ(i, 8);
1575 
1576   // Manually terminate the pipeline
1577   iter->Stop();
1578 }
1579 
1580 TEST_F(MindDataTestPipeline, TestToNumberSuccess2) {
1581   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberSuccess2.";
1582   // Test ToNumber with float numbers
1583 
1584   std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1585 
1586   // Create a TextFile dataset
1587   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1588   EXPECT_NE(ds, nullptr);
1589 
1590   // Create a Skip operation on ds
1591   ds = ds->Skip(8);
1592   EXPECT_NE(ds, nullptr);
1593 
1594   // Create a Take operation on ds
1595   ds = ds->Take(6);
1596   EXPECT_NE(ds, nullptr);
1597 
1598   // Create ToNumber operation on ds
1599   std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
1600   EXPECT_NE(to_number, nullptr);
1601 
1602   // Create a Map operation on ds
1603   ds = ds->Map({to_number}, {"text"});
1604   EXPECT_NE(ds, nullptr);
1605 
1606   // Create an iterator over the result of the above dataset
1607   // This will trigger the creation of the Execution Tree and launch it.
1608   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1609   EXPECT_NE(iter, nullptr);
1610 
1611   // Iterate the dataset and get each row
1612   std::unordered_map<std::string, mindspore::MSTensor> row;
1613   ASSERT_OK(iter->GetNextRow(&row));
1614 
1615   std::vector<double_t> expected = {-1.1, 1.4, -2219.321, 7623.453, -816256.234282, 162371864.243243};
1616 
1617   uint64_t i = 0;
1618   while (row.size() != 0) {
1619     auto ind = row["text"];
1620     std::shared_ptr<Tensor> de_expected_tensor;
1621     ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
1622     mindspore::MSTensor ms_expected_tensor =
1623       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1624     EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
1625     ASSERT_OK(iter->GetNextRow(&row));
1626     i++;
1627   }
1628 
1629   EXPECT_EQ(i, 6);
1630 
1631   // Manually terminate the pipeline
1632   iter->Stop();
1633 }
1634 
1635 TEST_F(MindDataTestPipeline, TestToNumberFail1) {
1636   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail1.";
1637   // Test ToNumber with overflow integer numbers
1638 
1639   std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1640 
1641   // Create a TextFile dataset
1642   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1643   EXPECT_NE(ds, nullptr);
1644 
1645   // Create a Skip operation on ds
1646   ds = ds->Skip(2);
1647   EXPECT_NE(ds, nullptr);
1648 
1649   // Create a Take operation on ds
1650   ds = ds->Take(6);
1651   EXPECT_NE(ds, nullptr);
1652 
1653   // Create ToNumber operation on ds
1654   std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
1655   EXPECT_NE(to_number, nullptr);
1656 
1657   // Create a Map operation on ds
1658   ds = ds->Map({to_number}, {"text"});
1659   EXPECT_NE(ds, nullptr);
1660 
1661   // Create an iterator over the result of the above dataset
1662   // This will trigger the creation of the Execution Tree and launch it.
1663   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1664   EXPECT_NE(iter, nullptr);
1665 
1666   // Iterate the dataset and get each row
1667   std::unordered_map<std::string, mindspore::MSTensor> row;
1668 
1669   // Expect error: input out of bounds of int8
1670   EXPECT_ERROR(iter->GetNextRow(&row));
1671 
1672   uint64_t i = 0;
1673   while (row.size() != 0) {
1674     EXPECT_ERROR(iter->GetNextRow(&row));
1675     i++;
1676   }
1677 
1678   // Expect failure: GetNextRow fail and return nothing
1679   EXPECT_EQ(i, 0);
1680 
1681   // Manually terminate the pipeline
1682   iter->Stop();
1683 }
1684 
1685 TEST_F(MindDataTestPipeline, TestToNumberFail2) {
1686   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail2.";
1687   // Test ToNumber with overflow float numbers
1688 
1689   std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1690 
1691   // Create a TextFile dataset
1692   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1693   EXPECT_NE(ds, nullptr);
1694 
1695   // Create a Skip operation on ds
1696   ds = ds->Skip(12);
1697   EXPECT_NE(ds, nullptr);
1698 
1699   // Create a Take operation on ds
1700   ds = ds->Take(2);
1701   EXPECT_NE(ds, nullptr);
1702 
1703   // Create ToNumber operation on ds
1704   std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
1705   EXPECT_NE(to_number, nullptr);
1706 
1707   // Create a Map operation on ds
1708   ds = ds->Map({to_number}, {"text"});
1709   EXPECT_NE(ds, nullptr);
1710 
1711   // Create an iterator over the result of the above dataset
1712   // This will trigger the creation of the Execution Tree and launch it.
1713   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1714   EXPECT_NE(iter, nullptr);
1715 
1716   // Iterate the dataset and get each row
1717   std::unordered_map<std::string, mindspore::MSTensor> row;
1718 
1719   // Expect error: input out of bounds of float16
1720   EXPECT_ERROR(iter->GetNextRow(&row));
1721 
1722   uint64_t i = 0;
1723   while (row.size() != 0) {
1724     EXPECT_ERROR(iter->GetNextRow(&row));
1725     i++;
1726   }
1727 
1728   // Expect failure: GetNextRow fail and return nothing
1729   EXPECT_EQ(i, 0);
1730 
1731   // Manually terminate the pipeline
1732   iter->Stop();
1733 }
1734 
1735 TEST_F(MindDataTestPipeline, TestToNumberFail3) {
1736   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail3.";
1737   // Test ToNumber with non numerical input
1738 
1739   std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1740 
1741   // Create a TextFile dataset
1742   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1743   EXPECT_NE(ds, nullptr);
1744 
1745   // Create a Skip operation on ds
1746   ds = ds->Skip(14);
1747   EXPECT_NE(ds, nullptr);
1748 
1749   // Create ToNumber operation on ds
1750   std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
1751   EXPECT_NE(to_number, nullptr);
1752 
1753   // Create a Map operation on ds
1754   ds = ds->Map({to_number}, {"text"});
1755   EXPECT_NE(ds, nullptr);
1756 
1757   // Create an iterator over the result of the above dataset
1758   // This will trigger the creation of the Execution Tree and launch it.
1759   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1760   EXPECT_NE(iter, nullptr);
1761 
1762   // Iterate the dataset and get each row
1763   std::unordered_map<std::string, mindspore::MSTensor> row;
1764 
1765   // Expect error: invalid input which is non numerical
1766   EXPECT_ERROR(iter->GetNextRow(&row));
1767 
1768   uint64_t i = 0;
1769   while (row.size() != 0) {
1770     EXPECT_ERROR(iter->GetNextRow(&row));
1771     i++;
1772   }
1773 
1774   // Expect failure: GetNextRow fail and return nothing
1775   EXPECT_EQ(i, 0);
1776 
1777   // Manually terminate the pipeline
1778   iter->Stop();
1779 }
1780 
1781 TEST_F(MindDataTestPipeline, TestToNumberFail4) {
1782   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail4.";
1783   // Test ToNumber with non numerical data type
1784 
1785   std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1786 
1787   // Create a TextFile dataset
1788   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1789   EXPECT_NE(ds, nullptr);
1790 
1791   // Create ToNumber operation on ds
1792   std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kObjectTypeString);
1793   EXPECT_NE(to_number, nullptr);
1794 
1795   // Create a Map operation on ds
1796   ds = ds->Map({to_number}, {"text"});
1797   EXPECT_NE(ds, nullptr);
1798 
1799   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1800   // Expect failure: invalid parameter with non numerical data type
1801   EXPECT_EQ(iter, nullptr);
1802 }
1803 
1804 TEST_F(MindDataTestPipeline, TestToNumberFail5) {
1805   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail5.";
1806   // Test ToNumber with non numerical data type
1807 
1808   std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1809 
1810   // Create a TextFile dataset
1811   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1812   EXPECT_NE(ds, nullptr);
1813 
1814   // Create ToNumber operation on ds
1815   std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeBool);
1816   EXPECT_NE(to_number, nullptr);
1817 
1818   // Create a Map operation on ds
1819   ds = ds->Map({to_number}, {"text"});
1820   EXPECT_NE(ds, nullptr);
1821 
1822   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1823   // Expect failure: invalid parameter with non numerical data type
1824   EXPECT_EQ(iter, nullptr);
1825 }
1826 
1827 TEST_F(MindDataTestPipeline, TestTruncateSequencePairSuccess1) {
1828   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairSuccess1.";
1829   // Testing basic TruncateSequencePair
1830 
1831   // Set seed for RandomDataset
1832   auto original_seed = config::get_seed();
1833   bool status_set_seed = config::set_seed(0);
1834   EXPECT_EQ(status_set_seed, true);
1835 
1836   // Set num_parallel_workers for RandomDataset
1837   auto original_worker = config::get_num_parallel_workers();
1838   bool status_set_worker = config::set_num_parallel_workers(1);
1839   EXPECT_EQ(status_set_worker, true);
1840 
1841   // Create a RandomDataset which has column names "col1" and "col2"
1842   std::shared_ptr<SchemaObj> schema = Schema();
1843   ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt16, {5}));
1844   ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt32, {3}));
1845   std::shared_ptr<Dataset> ds = RandomData(3, schema);
1846   EXPECT_NE(ds, nullptr);
1847 
1848   // Create a truncate_sequence_pair operation on ds
1849   std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(4);
1850   EXPECT_NE(truncate_sequence_pair, nullptr);
1851 
1852   // Create Map operation on ds
1853   ds = ds->Map({truncate_sequence_pair}, {"col1", "col2"});
1854   EXPECT_NE(ds, nullptr);
1855 
1856   // Create an iterator over the result of the above dataset
1857   // This will trigger the creation of the Execution Tree and launch it.
1858   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1859   EXPECT_NE(iter, nullptr);
1860 
1861   // Iterate the dataset and get each row
1862   std::unordered_map<std::string, mindspore::MSTensor> row;
1863   ASSERT_OK(iter->GetNextRow(&row));
1864 
1865   std::vector<std::vector<int16_t>> expected1 = {{-29556, -29556}, {-18505, -18505}, {-25958, -25958}};
1866   std::vector<std::vector<int32_t>> expected2 = {
1867     {-1751672937, -1751672937}, {-656877352, -656877352}, {-606348325, -606348325}};
1868 
1869   uint64_t i = 0;
1870   while (row.size() != 0) {
1871     auto ind1 = row["col1"];
1872     auto ind2 = row["col2"];
1873 
1874     std::shared_ptr<Tensor> de_expected_tensor1;
1875     ASSERT_OK(Tensor::CreateFromVector(expected1[i], &de_expected_tensor1));
1876     mindspore::MSTensor expected_tensor1 =
1877       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor1));
1878     EXPECT_MSTENSOR_EQ(ind1, expected_tensor1);
1879 
1880     std::shared_ptr<Tensor> de_expected_tensor2;
1881     ASSERT_OK(Tensor::CreateFromVector(expected2[i], &de_expected_tensor2));
1882     mindspore::MSTensor expected_tensor2 =
1883       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor2));
1884     EXPECT_MSTENSOR_EQ(ind2, expected_tensor2);
1885 
1886     ASSERT_OK(iter->GetNextRow(&row));
1887     i++;
1888   }
1889 
1890   EXPECT_EQ(i, 3);
1891 
1892   // Manually terminate the pipeline
1893   iter->Stop();
1894 
1895   // Restore original seed and num_parallel_workers
1896   status_set_seed = config::set_seed(original_seed);
1897   EXPECT_EQ(status_set_seed, true);
1898   status_set_worker = config::set_num_parallel_workers(original_worker);
1899   EXPECT_EQ(status_set_worker, true);
1900 }
1901 
1902 TEST_F(MindDataTestPipeline, TestTruncateSequencePairSuccess2) {
1903   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairSuccess2.";
1904   // Testing basic TruncateSequencePair with odd max_length
1905 
1906   // Set seed for RandomDataset
1907   auto original_seed = config::get_seed();
1908   bool status_set_seed = config::set_seed(1);
1909   EXPECT_EQ(status_set_seed, true);
1910 
1911   // Set num_parallel_workers for RandomDataset
1912   auto original_worker = config::get_num_parallel_workers();
1913   bool status_set_worker = config::set_num_parallel_workers(1);
1914   EXPECT_EQ(status_set_worker, true);
1915 
1916   // Create a RandomDataset which has column names "col1" and "col2"
1917   std::shared_ptr<SchemaObj> schema = Schema();
1918   ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt32, {4}));
1919   ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt64, {4}));
1920   std::shared_ptr<Dataset> ds = RandomData(4, schema);
1921   EXPECT_NE(ds, nullptr);
1922 
1923   // Create a truncate_sequence_pair operation on ds
1924   std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(5);
1925   EXPECT_NE(truncate_sequence_pair, nullptr);
1926 
1927   // Create Map operation on ds
1928   ds = ds->Map({truncate_sequence_pair}, {"col1", "col2"});
1929   EXPECT_NE(ds, nullptr);
1930 
1931   // Create an iterator over the result of the above dataset
1932   // This will trigger the creation of the Execution Tree and launch it.
1933   std::shared_ptr<Iterator> iter = ds->CreateIterator();
1934   EXPECT_NE(iter, nullptr);
1935 
1936   // Iterate the dataset and get each row
1937   std::unordered_map<std::string, mindspore::MSTensor> row;
1938   ASSERT_OK(iter->GetNextRow(&row));
1939 
1940   std::vector<std::vector<int32_t>> expected1 = {{1785358954, 1785358954, 1785358954},
1941                                                  {-1195853640, -1195853640, -1195853640},
1942                                                  {0, 0, 0},
1943                                                  {1296911693, 1296911693, 1296911693}};
1944   std::vector<std::vector<int64_t>> expected2 = {
1945     {-1, -1}, {-1229782938247303442, -1229782938247303442}, {2314885530818453536, 2314885530818453536}, {-1, -1}};
1946 
1947   uint64_t i = 0;
1948   while (row.size() != 0) {
1949     auto ind1 = row["col1"];
1950     auto ind2 = row["col2"];
1951 
1952     std::shared_ptr<Tensor> de_expected_tensor1;
1953     ASSERT_OK(Tensor::CreateFromVector(expected1[i], &de_expected_tensor1));
1954     mindspore::MSTensor expected_tensor1 =
1955       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor1));
1956     EXPECT_MSTENSOR_EQ(ind1, expected_tensor1);
1957 
1958     std::shared_ptr<Tensor> de_expected_tensor2;
1959     ASSERT_OK(Tensor::CreateFromVector(expected2[i], &de_expected_tensor2));
1960     mindspore::MSTensor expected_tensor2 =
1961       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor2));
1962     EXPECT_MSTENSOR_EQ(ind2, expected_tensor2);
1963 
1964     ASSERT_OK(iter->GetNextRow(&row));
1965     i++;
1966   }
1967 
1968   EXPECT_EQ(i, 4);
1969 
1970   // Manually terminate the pipeline
1971   iter->Stop();
1972 
1973   // Restore original seed and num_parallel_workers
1974   status_set_seed = config::set_seed(original_seed);
1975   EXPECT_EQ(status_set_seed, true);
1976   status_set_worker = config::set_num_parallel_workers(original_worker);
1977   EXPECT_EQ(status_set_worker, true);
1978 }
1979 
1980 TEST_F(MindDataTestPipeline, TestTruncateSequencePairFail) {
1981   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairFail.";
1982   // Testing TruncateSequencePair with negative max_length
1983 
1984   // Create a RandomDataset which has column names "col1" and "col2"
1985   std::shared_ptr<SchemaObj> schema = Schema();
1986   ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt8, {3}));
1987   ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt8, {3}));
1988   std::shared_ptr<Dataset> ds = RandomData(3, schema);
1989   EXPECT_NE(ds, nullptr);
1990 
1991   // Create a truncate_sequence_pair operation on ds
1992   std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(-1);
1993   EXPECT_NE(truncate_sequence_pair, nullptr);
1994 
1995   // Create a Map operation on ds
1996   ds = ds->Map({truncate_sequence_pair});
1997   EXPECT_NE(ds, nullptr);
1998 
1999   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2000   // Expect failure: invalid TruncateSequencePair input (invalid parameter with negative max_length)
2001   EXPECT_EQ(iter, nullptr);
2002 }
2003 
2004 TEST_F(MindDataTestPipeline, TestNgramSuccess) {
2005   // Testing the parameter of Ngram interface.
2006   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess.";
2007 
2008   // Create a TextFile dataset
2009   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
2010   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2011   EXPECT_NE(ds, nullptr);
2012 
2013   // Create white_tokenizer operation on ds
2014   std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
2015   EXPECT_NE(white_tokenizer, nullptr);
2016   // Create sliding_window operation on ds
2017   std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2}, {"_", 1}, {"_", 1}, " "));
2018   EXPECT_NE(ngram_op, nullptr);
2019 
2020   // Create Map operation on ds
2021   ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
2022   EXPECT_NE(ds, nullptr);
2023 
2024   // Create an iterator over the result of the above dataset
2025   // This will trigger the creation of the Execution Tree and launch it.
2026   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2027   EXPECT_NE(iter, nullptr);
2028 
2029   // Iterate the dataset and get each row
2030   std::unordered_map<std::string, mindspore::MSTensor> row;
2031   ASSERT_OK(iter->GetNextRow(&row));
2032 
2033   std::vector<std::vector<std::string>> expected = {{"_ This", "This is", "is a", "a text", "text file.", "file. _"},
2034                                                     {"_ Be", "Be happy", "happy every", "every day.", "day. _"},
2035                                                     {"_ Good", "Good luck", "luck to", "to everyone.", "everyone. _"}};
2036 
2037   uint64_t i = 0;
2038   while (row.size() != 0) {
2039     auto ind = row["text"];
2040 
2041     std::shared_ptr<Tensor> de_expected_tensor;
2042     int x = expected[i].size();
2043     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
2044     mindspore::MSTensor expected_tensor =
2045       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
2046     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
2047 
2048     ASSERT_OK(iter->GetNextRow(&row));
2049     i++;
2050   }
2051 
2052   EXPECT_EQ(i, 3);
2053 
2054   // Manually terminate the pipeline
2055   iter->Stop();
2056 }
2057 
2058 TEST_F(MindDataTestPipeline, TestNgramSuccess1) {
2059   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess1.";
2060 
2061   // Create a TextFile dataset
2062   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
2063   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2064   EXPECT_NE(ds, nullptr);
2065 
2066   // Create white_tokenizer operation on ds
2067   std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
2068   EXPECT_NE(white_tokenizer, nullptr);
2069   // Create sliding_window operation on ds
2070   std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2, 3}, {"&", 2}, {"&", 2}, "-"));
2071   EXPECT_NE(ngram_op, nullptr);
2072 
2073   // Create Map operation on ds
2074   ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
2075   EXPECT_NE(ds, nullptr);
2076 
2077   // Create an iterator over the result of the above dataset
2078   // This will trigger the creation of the Execution Tree and launch it.
2079   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2080   EXPECT_NE(iter, nullptr);
2081 
2082   // Iterate the dataset and get each row
2083   std::unordered_map<std::string, mindspore::MSTensor> row;
2084   ASSERT_OK(iter->GetNextRow(&row));
2085 
2086   std::vector<std::vector<std::string>> expected = {
2087     {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a",
2088      "is-a-text",
2089      "a-text-file.", "text-file.-&", "file.-&-&"},
2090     {"&-Be", "Be-happy", "happy-every", "every-day.", "day.-&", "&-&-Be", "&-Be-happy", "Be-happy-every",
2091      "happy-every-day.", "every-day.-&", "day.-&-&"},
2092     {"&-Good", "Good-luck", "luck-to", "to-everyone.", "everyone.-&", "&-&-Good", "&-Good-luck", "Good-luck-to",
2093      "luck-to-everyone.", "to-everyone.-&", "everyone.-&-&"}};
2094 
2095   uint64_t i = 0;
2096   while (row.size() != 0) {
2097     auto ind = row["text"];
2098 
2099     std::shared_ptr<Tensor> de_expected_tensor;
2100     int x = expected[i].size();
2101     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
2102     mindspore::MSTensor expected_tensor =
2103       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
2104     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
2105 
2106     ASSERT_OK(iter->GetNextRow(&row));
2107     i++;
2108   }
2109 
2110   EXPECT_EQ(i, 3);
2111 
2112   // Manually terminate the pipeline
2113   iter->Stop();
2114 }
2115 
2116 TEST_F(MindDataTestPipeline, TestNgramFail1) {
2117   // Testing the incorrect parameter of Ngram interface.
2118   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail1.";
2119 
2120   // Create a TextFile dataset
2121   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
2122   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2123   EXPECT_NE(ds, nullptr);
2124 
2125   // Create sliding_window operation on ds
2126   // Testing the vector of ngram is empty
2127   std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({}));
2128   EXPECT_NE(ngram_op, nullptr);
2129 
2130   // Create a Map operation on ds
2131   ds = ds->Map({ngram_op});
2132   EXPECT_NE(ds, nullptr);
2133 
2134   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2135   // Expect failure: invalid Ngram input (the vector of ngram is empty)
2136   EXPECT_EQ(iter, nullptr);
2137 }
2138 
2139 TEST_F(MindDataTestPipeline, TestNgramFail2) {
2140   // Testing the incorrect parameter of Ngram interface.
2141   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail2.";
2142 
2143   // Create a TextFile dataset
2144   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
2145   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2146   EXPECT_NE(ds, nullptr);
2147 
2148   // Create sliding_window operation on ds
2149   // Testing the value of ngrams vector less than and equal to 0
2150   std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({0}));
2151   EXPECT_NE(ngram_op, nullptr);
2152 
2153   // Create a Map operation on ds
2154   ds = ds->Map({ngram_op});
2155   EXPECT_NE(ds, nullptr);
2156 
2157   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2158   // Expect failure: invalid Ngram input (the value of ngrams vector less than and equal to 0)
2159   EXPECT_EQ(iter, nullptr);
2160 }
2161 
2162 TEST_F(MindDataTestPipeline, TestNgramFail3) {
2163   // Testing the incorrect parameter of Ngram interface.
2164   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail3.";
2165 
2166   // Create a TextFile dataset
2167   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
2168   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2169   EXPECT_NE(ds, nullptr);
2170 
2171   // Create sliding_window operation on ds
2172   // Testing the value of ngrams vector less than and equal to 0
2173   std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({-2}));
2174   EXPECT_NE(ngram_op, nullptr);
2175 
2176   // Create a Map operation on ds
2177   ds = ds->Map({ngram_op});
2178   EXPECT_NE(ds, nullptr);
2179 
2180   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2181   // Expect failure: invalid Ngram input (the value of ngrams vector less than and equal to 0)
2182   EXPECT_EQ(iter, nullptr);
2183 }
2184 
2185 TEST_F(MindDataTestPipeline, TestNgramFail4) {
2186   // Testing the incorrect parameter of Ngram interface.
2187   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail4.";
2188 
2189   // Create a TextFile dataset
2190   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
2191   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2192   EXPECT_NE(ds, nullptr);
2193 
2194   // Create sliding_window operation on ds
2195   // Testing the second parameter pad_width in left_pad vector less than 0
2196   std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2}, {"", -1}));
2197   EXPECT_NE(ngram_op, nullptr);
2198 
2199   // Create a Map operation on ds
2200   ds = ds->Map({ngram_op});
2201   EXPECT_NE(ds, nullptr);
2202 
2203   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2204   // Expect failure: invalid Ngram input (the second parameter pad_width in left_pad vector less than 0)
2205   EXPECT_EQ(iter, nullptr);
2206 }
2207 
2208 TEST_F(MindDataTestPipeline, TestNgramFail5) {
2209   // Testing the incorrect parameter of Ngram interface.
2210   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail5.";
2211 
2212   // Create a TextFile dataset
2213   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
2214   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2215   EXPECT_NE(ds, nullptr);
2216 
2217   // Create sliding_window operation on ds
2218   // Testing the second parameter pad_width in right_pad vector less than 0
2219   std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2}, {"", 1}, {"", -1}));
2220   EXPECT_NE(ngram_op, nullptr);
2221 
2222   // Create a Map operation on ds
2223   ds = ds->Map({ngram_op});
2224   EXPECT_NE(ds, nullptr);
2225 
2226   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2227   // Expect failure: invalid Ngram input (the second parameter pad_width in left_pad vector less than 0)
2228   EXPECT_EQ(iter, nullptr);
2229 }
2230 
2231 TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success) {
2232   // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfkc.
2233   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success.";
2234 
2235   // Create a TextFile dataset
2236   std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
2237   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2238   EXPECT_NE(ds, nullptr);
2239 
2240   // Create normalizeutf8 operation on ds
2241   std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfkc);
2242   EXPECT_NE(normalizeutf8, nullptr);
2243 
2244   // Create Map operation on ds
2245   ds = ds->Map({normalizeutf8}, {"text"});
2246   EXPECT_NE(ds, nullptr);
2247 
2248   // Create an iterator over the result of the above dataset
2249   // This will trigger the creation of the Execution Tree and launch it.
2250   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2251   EXPECT_NE(iter, nullptr);
2252 
2253   // Iterate the dataset and get each row
2254   std::unordered_map<std::string, mindspore::MSTensor> row;
2255   ASSERT_OK(iter->GetNextRow(&row));
2256 
2257   std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};
2258 
2259   uint64_t i = 0;
2260   while (row.size() != 0) {
2261     auto ind = row["text"];
2262     std::shared_ptr<Tensor> de_expected_tensor;
2263     ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
2264     mindspore::MSTensor ms_expected_tensor =
2265       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
2266     EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
2267     ASSERT_OK(iter->GetNextRow(&row));
2268     i++;
2269   }
2270 
2271   EXPECT_EQ(i, 6);
2272 
2273   // Manually terminate the pipeline
2274   iter->Stop();
2275 }
2276 
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success1) {
  // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfc.
  // Feeds testTokenizerData/normalize.txt through NormalizeUTF8(kNfc) and compares each
  // output row against the expected NFC-normalized string, in file order.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success1.";

  // Create a TextFile dataset (no shuffle, so rows arrive in the same order as `expected`)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create normalizeutf8 operation on ds
  std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfc);
  EXPECT_NE(normalizeutf8, nullptr);

  // Create Map operation on ds: apply the normalizer to the "text" column in place
  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Canonical composition: unlike the kNfkc/kNfkd cases in this file, compatibility
  // characters such as "2⁵" are left unchanged.
  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    // Wrap the expected string in a scalar DE tensor so it can be compared with the row tensor.
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
    mindspore::MSTensor ms_expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // The data file is expected to contain exactly 6 lines.
  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}
2322 
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success2) {
  // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfd.
  // Same input file as the kNfc case; kNfd performs canonical decomposition, so
  // compatibility characters (e.g. "2⁵") still remain unchanged in the output.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success2.";

  // Create a TextFile dataset (no shuffle, so rows arrive in the same order as `expected`)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create normalizeutf8 operation on ds
  std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfd);
  EXPECT_NE(normalizeutf8, nullptr);

  // Create Map operation on ds: apply the normalizer to the "text" column in place
  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // NOTE(review): these render identically to the kNfc expectations; the underlying
  // byte sequences are presumably the decomposed forms — confirm against the data file.
  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    // Wrap the expected string in a scalar DE tensor for comparison with the row tensor.
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
    mindspore::MSTensor ms_expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // The data file is expected to contain exactly 6 lines.
  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}
2368 
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success3) {
  // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfkd.
  // Compatibility decomposition: unlike kNfc/kNfd above, compatibility characters are
  // folded ("2⁵" becomes "25").
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success3.";

  // Create a TextFile dataset (no shuffle, so rows arrive in the same order as `expected`)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create normalizeutf8 operation on ds
  std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfkd);
  EXPECT_NE(normalizeutf8, nullptr);

  // Create Map operation on ds: apply the normalizer to the "text" column in place
  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    // Wrap the expected string in a scalar DE tensor for comparison with the row tensor.
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
    mindspore::MSTensor ms_expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // The data file is expected to contain exactly 6 lines.
  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}
2414 
2415 TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess) {
2416   // Testing the parameter of RegexReplace interface when the replace_all is true.
2417   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess.";
2418 
2419   // Create a TextFile dataset
2420   std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
2421   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2422   EXPECT_NE(ds, nullptr);
2423 
2424   // Create regex_replace operation on ds
2425   std::shared_ptr<TensorTransform> regex_replace = std::make_shared<text::RegexReplace>("\\s+", "_", true);
2426   EXPECT_NE(regex_replace, nullptr);
2427 
2428   // Create Map operation on ds
2429   ds = ds->Map({regex_replace}, {"text"});
2430   EXPECT_NE(ds, nullptr);
2431 
2432   // Create an iterator over the result of the above dataset
2433   // This will trigger the creation of the Execution Tree and launch it.
2434   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2435   EXPECT_NE(iter, nullptr);
2436 
2437   // Iterate the dataset and get each row
2438   std::unordered_map<std::string, mindspore::MSTensor> row;
2439   ASSERT_OK(iter->GetNextRow(&row));
2440 
2441   std::vector<std::string> expected = {"Hello_World", "Let's_Go",          "1:hello",        "2:world",
2442                                        "31:beijing",  "Welcome_to_China!", "_我_不想_长大_", "Welcome_to_Shenzhen!"};
2443 
2444   uint64_t i = 0;
2445   while (row.size() != 0) {
2446     auto ind = row["text"];
2447     std::shared_ptr<Tensor> de_expected_tensor;
2448     ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
2449     mindspore::MSTensor ms_expected_tensor =
2450       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
2451     EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
2452     ASSERT_OK(iter->GetNextRow(&row));
2453     i++;
2454   }
2455 
2456   EXPECT_EQ(i, 8);
2457 
2458   // Manually terminate the pipeline
2459   iter->Stop();
2460 }
2461 
2462 TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess1) {
2463   // Testing the parameter of RegexReplace interface when the replace_all is false.
2464   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess1.";
2465 
2466   // Create a TextFile dataset
2467   std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
2468   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2469   EXPECT_NE(ds, nullptr);
2470 
2471   // Create regex_replace operation on ds
2472   std::shared_ptr<TensorTransform> regex_replace = std::make_shared<text::RegexReplace>("\\s+", "_", false);
2473   EXPECT_NE(regex_replace, nullptr);
2474 
2475   // Create Map operation on ds
2476   ds = ds->Map({regex_replace}, {"text"});
2477   EXPECT_NE(ds, nullptr);
2478 
2479   // Create an iterator over the result of the above dataset
2480   // This will trigger the creation of the Execution Tree and launch it.
2481   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2482   EXPECT_NE(iter, nullptr);
2483 
2484   // Iterate the dataset and get each row
2485   std::unordered_map<std::string, mindspore::MSTensor> row;
2486   ASSERT_OK(iter->GetNextRow(&row));
2487 
2488   std::vector<std::string> expected = {"Hello_World", "Let's_Go",          "1:hello",          "2:world",
2489                                        "31:beijing",  "Welcome_to China!", "_我	不想  长大	", "Welcome_to Shenzhen!"};
2490 
2491   uint64_t i = 0;
2492   while (row.size() != 0) {
2493     auto ind = row["text"];
2494     std::shared_ptr<Tensor> de_expected_tensor;
2495     ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
2496     mindspore::MSTensor ms_expected_tensor =
2497       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
2498     EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
2499     ASSERT_OK(iter->GetNextRow(&row));
2500     i++;
2501   }
2502 
2503   EXPECT_EQ(i, 8);
2504 
2505   // Manually terminate the pipeline
2506   iter->Stop();
2507 }
2508 
2509 TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess) {
2510   // Testing the parameter of RegexTokenizer interface when the with_offsets is false.
2511   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess.";
2512 
2513   // Create a TextFile dataset
2514   std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
2515   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2516   EXPECT_NE(ds, nullptr);
2517 
2518   // Create regex_tokenizer operation on ds
2519   std::shared_ptr<TensorTransform> regex_tokenizer = std::make_shared<text::RegexTokenizer>("\\s+", "\\s+", false);
2520   EXPECT_NE(regex_tokenizer, nullptr);
2521 
2522   // Create Map operation on ds
2523   ds = ds->Map({regex_tokenizer}, {"text"});
2524   EXPECT_NE(ds, nullptr);
2525 
2526   // Create an iterator over the result of the above dataset
2527   // This will trigger the creation of the Execution Tree and launch it.
2528   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2529   EXPECT_NE(iter, nullptr);
2530 
2531   // Iterate the dataset and get each row
2532   std::unordered_map<std::string, mindspore::MSTensor> row;
2533   ASSERT_OK(iter->GetNextRow(&row));
2534 
2535   std::vector<std::vector<std::string>> expected = {{"Hello", " ", "World"},
2536                                                     {"Let's", " ", "Go"},
2537                                                     {"1:hello"},
2538                                                     {"2:world"},
2539                                                     {"31:beijing"},
2540                                                     {"Welcome", " ", "to", " ", "China!"},
2541                                                     {"  ", "我", "	", "不想", "  ", "长大", "	"},
2542                                                     {"Welcome", " ", "to", " ", "Shenzhen!"}};
2543 
2544   uint64_t i = 0;
2545   while (row.size() != 0) {
2546     auto ind = row["text"];
2547 
2548     std::shared_ptr<Tensor> de_expected_tensor;
2549     int x = expected[i].size();
2550     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
2551     mindspore::MSTensor expected_tensor =
2552       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
2553     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
2554 
2555     ASSERT_OK(iter->GetNextRow(&row));
2556     i++;
2557   }
2558 
2559   EXPECT_EQ(i, 8);
2560 
2561   // Manually terminate the pipeline
2562   iter->Stop();
2563 }
2564 
2565 TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess1) {
2566   // Testing the parameter of RegexTokenizer interface when the with_offsets is true.
2567   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess1.";
2568 
2569   // Create a TextFile dataset
2570   std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
2571   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2572   EXPECT_NE(ds, nullptr);
2573 
2574   // Create regex_tokenizer operation on ds
2575   std::shared_ptr<TensorTransform> regex_tokenizer = std::make_shared<text::RegexTokenizer>("\\s+", "\\s+", true);
2576   EXPECT_NE(regex_tokenizer, nullptr);
2577 
2578   // Create Map operation on ds
2579   ds = ds->Map({regex_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
2580                {"token", "offsets_start", "offsets_limit"});
2581   EXPECT_NE(ds, nullptr);
2582 
2583   // Create an iterator over the result of the above dataset
2584   // This will trigger the creation of the Execution Tree and launch it.
2585   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2586   EXPECT_NE(iter, nullptr);
2587 
2588   // Iterate the dataset and get each row
2589   std::unordered_map<std::string, mindspore::MSTensor> row;
2590   ASSERT_OK(iter->GetNextRow(&row));
2591 
2592   std::vector<std::vector<std::string>> expected_tokens = {{"Hello", " ", "World"},
2593                                                            {"Let's", " ", "Go"},
2594                                                            {"1:hello"},
2595                                                            {"2:world"},
2596                                                            {"31:beijing"},
2597                                                            {"Welcome", " ", "to", " ", "China!"},
2598                                                            {"  ", "我", "	", "不想", "  ", "长大", "	"},
2599                                                            {"Welcome", " ", "to", " ", "Shenzhen!"}};
2600 
2601   std::vector<std::vector<uint32_t>> expected_offsets_start = {
2602     {0, 5, 6}, {0, 5, 6}, {0}, {0}, {0}, {0, 7, 8, 10, 11}, {0, 2, 5, 6, 12, 14, 20}, {0, 7, 8, 10, 11}};
2603   std::vector<std::vector<uint32_t>> expected_offsets_limit = {
2604     {5, 6, 11}, {5, 6, 8}, {7}, {7}, {10}, {7, 8, 10, 11, 17}, {2, 5, 6, 12, 14, 20, 21}, {7, 8, 10, 11, 20}};
2605 
2606   uint64_t i = 0;
2607   while (row.size() != 0) {
2608     auto token = row["token"];
2609     auto start = row["offsets_start"];
2610     auto limit = row["offsets_limit"];
2611 
2612     std::shared_ptr<Tensor> de_expected_tokens;
2613     int x = expected_tokens[i].size();
2614     ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
2615     mindspore::MSTensor ms_expected_tokens =
2616       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
2617     EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);
2618 
2619     std::shared_ptr<Tensor> de_expected_offsets_start;
2620     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
2621     mindspore::MSTensor ms_expected_offsets_start =
2622       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
2623     EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
2624 
2625     std::shared_ptr<Tensor> de_expected_offsets_limit;
2626     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
2627     mindspore::MSTensor ms_expected_offsets_limit =
2628       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
2629     EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
2630 
2631     ASSERT_OK(iter->GetNextRow(&row));
2632     i++;
2633   }
2634 
2635   EXPECT_EQ(i, 8);
2636 
2637   // Manually terminate the pipeline
2638   iter->Stop();
2639 }
2640 
2641 TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess) {
2642   // Testing the parameter of UnicodeCharTokenizer interface when the with_offsets is default.
2643   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess.";
2644 
2645   // Create a TextFile dataset
2646   std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
2647   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2648   EXPECT_NE(ds, nullptr);
2649 
2650   // Create unicodechar_tokenizer operation on ds
2651   std::shared_ptr<TensorTransform> unicodechar_tokenizer = std::make_shared<text::UnicodeCharTokenizer>();
2652   EXPECT_NE(unicodechar_tokenizer, nullptr);
2653 
2654   // Create Map operation on ds
2655   ds = ds->Map({unicodechar_tokenizer}, {"text"});
2656   EXPECT_NE(ds, nullptr);
2657 
2658   // Create an iterator over the result of the above dataset
2659   // This will trigger the creation of the Execution Tree and launch it.
2660   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2661   EXPECT_NE(iter, nullptr);
2662 
2663   // Iterate the dataset and get each row
2664   std::unordered_map<std::string, mindspore::MSTensor> row;
2665   ASSERT_OK(iter->GetNextRow(&row));
2666 
2667   std::vector<std::vector<std::string>> expected = {
2668     {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
2669     {"北", "京", "欢", "迎", "您", "!"},
2670     {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
2671     {" ", " "}};
2672 
2673   uint64_t i = 0;
2674   while (row.size() != 0) {
2675     auto ind = row["text"];
2676 
2677     std::shared_ptr<Tensor> de_expected_tensor;
2678     int x = expected[i].size();
2679     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
2680     mindspore::MSTensor expected_tensor =
2681       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
2682     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
2683 
2684     ASSERT_OK(iter->GetNextRow(&row));
2685     i++;
2686   }
2687 
2688   EXPECT_EQ(i, 4);
2689 
2690   // Manually terminate the pipeline
2691   iter->Stop();
2692 }
2693 
TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess1) {
  // Testing the parameter of UnicodeCharTokenizer interface when the with_offsets is true.
  // In addition to one token per Unicode character, the op emits per-token byte offsets:
  // "offsets_start" (inclusive) and "offsets_limit" (exclusive) into the original line.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess1.";

  // Create a TextFile dataset (no shuffle, so rows arrive in the same order as the expected tables)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create unicodechar_tokenizer operation on ds with with_offsets enabled
  std::shared_ptr<TensorTransform> unicodechar_tokenizer = std::make_shared<text::UnicodeCharTokenizer>(true);
  EXPECT_NE(unicodechar_tokenizer, nullptr);

  // Map "text" into the three output columns produced when with_offsets is enabled.
  ds = ds->Map({unicodechar_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
               {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected_tokens = {
    {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
    {"北", "京", "欢", "迎", "您", "!"},
    {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
    {" ", " "}};

  // Byte offsets: ASCII characters advance by 1, CJK characters by 3 (UTF-8).
  std::vector<std::vector<uint32_t>> expected_offsets_start = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18},
    {0, 3, 6, 9, 12, 15},
    {0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16},
    {0, 1}};

  std::vector<std::vector<uint32_t>> expected_offsets_limit = {
    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
    {3, 6, 9, 12, 15, 18},
    {3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17},
    {1, 2}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto token = row["token"];
    auto start = row["offsets_start"];
    auto limit = row["offsets_limit"];

    std::shared_ptr<Tensor> de_expected_tokens;
    // NOTE(review): size_t -> int narrows here; `x` is then shared as the shape extent
    // for all three expected columns (their per-row lengths are equal by construction).
    int x = expected_tokens[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
    mindspore::MSTensor ms_expected_tokens =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
    EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);

    std::shared_ptr<Tensor> de_expected_offsets_start;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
    mindspore::MSTensor ms_expected_offsets_start =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
    EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);

    std::shared_ptr<Tensor> de_expected_offsets_limit;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
    mindspore::MSTensor ms_expected_offsets_limit =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
    EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // The data file is expected to contain exactly 4 lines.
  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}
2773 
// English wordpiece vocab for the WordpieceTokenizer tests below; pieces prefixed
// with "##" are word-internal suffixes (matching the tests' "##" suffix indicator).
std::vector<std::string> vocab_english = {"book", "cholera", "era", "favor", "##ite", "my",
                                          "is",   "love",    "dur", "##ing", "the"};

// Single-character Chinese vocab for the WordpieceTokenizer tests below.
std::vector<std::string> vocab_chinese = {"我", "最", "喜", "欢", "的", "书", "是", "霍", "乱", "时", "期", "爱", "情"};
2778 
2779 TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess1) {
2780   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess1.";
2781   // Test WordpieceTokenizer with default parameters on English vocab
2782 
2783   // Create a TextFile dataset
2784   std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
2785   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2786   EXPECT_NE(ds, nullptr);
2787 
2788   // Create Take operation on ds
2789   ds = ds->Take(10);
2790   EXPECT_NE(ds, nullptr);
2791 
2792   // Create a vocab from vector
2793   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
2794   Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
2795   EXPECT_EQ(s, Status::OK());
2796 
2797   // Create WordpieceTokenizer operation on ds
2798   std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab);
2799   EXPECT_NE(wordpiece_tokenizer, nullptr);
2800 
2801   // Create Map operation on ds
2802   ds = ds->Map({wordpiece_tokenizer}, {"text"});
2803   EXPECT_NE(ds, nullptr);
2804 
2805   // Create an iterator over the result of the above dataset
2806   // This will trigger the creation of the Execution Tree and launch it.
2807   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2808   EXPECT_NE(iter, nullptr);
2809 
2810   // Iterate the dataset and get each row
2811   std::unordered_map<std::string, mindspore::MSTensor> row;
2812   ASSERT_OK(iter->GetNextRow(&row));
2813 
2814   std::vector<std::vector<std::string>> expected = {
2815     {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};
2816 
2817   uint64_t i = 0;
2818   while (row.size() != 0) {
2819     auto txt = row["text"];
2820     std::shared_ptr<Tensor> de_expected_tensor;
2821     ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
2822     mindspore::MSTensor expected_tensor =
2823       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
2824     EXPECT_MSTENSOR_EQ(txt, expected_tensor);
2825     ASSERT_OK(iter->GetNextRow(&row));
2826     i++;
2827   }
2828 
2829   EXPECT_EQ(i, 10);
2830 
2831   // Manually terminate the pipeline
2832   iter->Stop();
2833 }
2834 
2835 TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess2) {
2836   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess2.";
2837   // Test WordpieceTokenizer with empty unknown_token
2838 
2839   // Create a TextFile dataset
2840   std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
2841   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2842   EXPECT_NE(ds, nullptr);
2843 
2844   // Create Take operation on ds
2845   ds = ds->Take(10);
2846   EXPECT_NE(ds, nullptr);
2847 
2848   // Create a vocab from vector
2849   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
2850   Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
2851   EXPECT_EQ(s, Status::OK());
2852 
2853   // Create WordpieceTokenizer operation on ds
2854   std::shared_ptr<TensorTransform> wordpiece_tokenizer =
2855     std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "", false);
2856   EXPECT_NE(wordpiece_tokenizer, nullptr);
2857 
2858   // Create Map operation on ds
2859   ds = ds->Map({wordpiece_tokenizer}, {"text"});
2860   EXPECT_NE(ds, nullptr);
2861 
2862   // Create an iterator over the result of the above dataset
2863   // This will trigger the creation of the Execution Tree and launch it.
2864   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2865   EXPECT_NE(iter, nullptr);
2866 
2867   // Iterate the dataset and get each row
2868   std::unordered_map<std::string, mindspore::MSTensor> row;
2869   ASSERT_OK(iter->GetNextRow(&row));
2870 
2871   std::vector<std::vector<std::string>> expected = {
2872     {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"what"}};
2873 
2874   uint64_t i = 0;
2875   while (row.size() != 0) {
2876     auto txt = row["text"];
2877     std::shared_ptr<Tensor> de_expected_tensor;
2878     ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
2879     mindspore::MSTensor expected_tensor =
2880       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
2881     EXPECT_MSTENSOR_EQ(txt, expected_tensor);
2882     ASSERT_OK(iter->GetNextRow(&row));
2883     i++;
2884   }
2885 
2886   EXPECT_EQ(i, 10);
2887 
2888   // Manually terminate the pipeline
2889   iter->Stop();
2890 }
2891 
2892 TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess3) {
2893   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess3.";
2894   // Test WordpieceTokenizer with non-default max_bytes_per_token
2895 
2896   // Create a TextFile dataset
2897   std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
2898   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2899   EXPECT_NE(ds, nullptr);
2900 
2901   // Create Take operation on ds
2902   ds = ds->Take(10);
2903   EXPECT_NE(ds, nullptr);
2904 
2905   // Create a vocab from vector
2906   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
2907   Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
2908   EXPECT_EQ(s, Status::OK());
2909 
2910   // Create WordpieceTokenizer operation on ds
2911   std::shared_ptr<TensorTransform> wordpiece_tokenizer =
2912     std::make_shared<text::WordpieceTokenizer>(vocab, "##", 4, "[UNK]", false);
2913   EXPECT_NE(wordpiece_tokenizer, nullptr);
2914 
2915   // Create Map operation on ds
2916   ds = ds->Map({wordpiece_tokenizer}, {"text"});
2917   EXPECT_NE(ds, nullptr);
2918 
2919   // Create an iterator over the result of the above dataset
2920   // This will trigger the creation of the Execution Tree and launch it.
2921   std::shared_ptr<Iterator> iter = ds->CreateIterator();
2922   EXPECT_NE(iter, nullptr);
2923 
2924   // Iterate the dataset and get each row
2925   std::unordered_map<std::string, mindspore::MSTensor> row;
2926   ASSERT_OK(iter->GetNextRow(&row));
2927 
2928   std::vector<std::vector<std::string>> expected = {{"my"},    {"[UNK]"}, {"book"},  {"is"},  {"love"},
2929                                                     {"[UNK]"}, {"the"},   {"[UNK]"}, {"era"}, {"[UNK]"}};
2930 
2931   uint64_t i = 0;
2932   while (row.size() != 0) {
2933     auto txt = row["text"];
2934     std::shared_ptr<Tensor> de_expected_tensor;
2935     ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
2936     mindspore::MSTensor expected_tensor =
2937       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
2938     EXPECT_MSTENSOR_EQ(txt, expected_tensor);
2939     ASSERT_OK(iter->GetNextRow(&row));
2940     i++;
2941   }
2942 
2943   EXPECT_EQ(i, 10);
2944 
2945   // Manually terminate the pipeline
2946   iter->Stop();
2947 }
2948 
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess4) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess4.";
  // Test WordpieceTokenizer with default parameters on Chinese vocab

  // Create a TextFile dataset (no shuffle, so rows arrive in the same order as `expected`)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Skip the first 10 lines (the English portion exercised by the other tests)...
  ds = ds->Skip(10);
  EXPECT_NE(ds, nullptr);

  // ...then take the next 15 lines (the Chinese portion).
  ds = ds->Take(15);
  EXPECT_NE(ds, nullptr);

  // Build the Chinese single-character vocab the tokenizer looks pieces up in.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_chinese, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // suffix_indicator "##", max_bytes_per_token 100, unknown_token "[UNK]", with_offsets false
  std::shared_ptr<TensorTransform> wordpiece_tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", false);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create Map operation on ds: tokenize the "text" column in place
  ds = ds->Map({wordpiece_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // One in-vocab character per line; the final line is out-of-vocab and maps to "[UNK]".
  std::vector<std::vector<std::string>> expected = {{"我"}, {"最"}, {"喜"}, {"欢"}, {"的"}, {"书"}, {"是"},   {"霍"},
                                                    {"乱"}, {"时"}, {"期"}, {"的"}, {"爱"}, {"情"}, {"[UNK]"}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto txt = row["text"];
    // Build the expected 1-D string tensor for this row and compare.
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Exactly the 15 taken rows should have been produced.
  EXPECT_EQ(i, 15);

  // Manually terminate the pipeline
  iter->Stop();
}
3009 
3010 TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess5) {
3011   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess5.";
3012   // Test WordpieceTokenizer with with_offsets true
3013 
3014   // Create a TextFile dataset
3015   std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
3016   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3017   EXPECT_NE(ds, nullptr);
3018 
3019   // Create Take operation on ds
3020   ds = ds->Take(10);
3021   EXPECT_NE(ds, nullptr);
3022 
3023   // Create a vocab from vector
3024   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
3025   Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
3026   EXPECT_EQ(s, Status::OK());
3027 
3028   // Create WordpieceTokenizer operation on ds
3029   std::shared_ptr<TensorTransform> wordpiece_tokenizer =
3030     std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", true);
3031   EXPECT_NE(wordpiece_tokenizer, nullptr);
3032 
3033   // Create Map operation on ds
3034   ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
3035   EXPECT_NE(ds, nullptr);
3036 
3037   // Create an iterator over the result of the above dataset
3038   // This will trigger the creation of the Execution Tree and launch it.
3039   std::shared_ptr<Iterator> iter = ds->CreateIterator();
3040   EXPECT_NE(iter, nullptr);
3041 
3042   // Iterate the dataset and get each row
3043   std::unordered_map<std::string, mindspore::MSTensor> row;
3044   ASSERT_OK(iter->GetNextRow(&row));
3045 
3046   std::vector<std::vector<std::string>> expected = {
3047     {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};
3048   std::vector<std::vector<uint32_t>> expected_offsets_start = {{0}, {0, 5}, {0}, {0}, {0}, {0, 3}, {0}, {0}, {0}, {0}};
3049   std::vector<std::vector<uint32_t>> expected_offsets_limit = {{2}, {5, 8}, {4}, {2}, {4}, {3, 6}, {3}, {7}, {3}, {4}};
3050 
3051   uint64_t i = 0;
3052   while (row.size() != 0) {
3053     auto txt = row["token"];
3054     std::shared_ptr<Tensor> de_expected_tensor;
3055     ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
3056     mindspore::MSTensor expected_tensor =
3057       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
3058     EXPECT_MSTENSOR_EQ(txt, expected_tensor);
3059 
3060     auto start = row["offsets_start"];
3061     std::shared_ptr<Tensor> de_expected_start_tensor;
3062     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], &de_expected_start_tensor));
3063     mindspore::MSTensor expected_start_tensor =
3064       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_start_tensor));
3065     EXPECT_MSTENSOR_EQ(start, expected_start_tensor);
3066 
3067     auto limit = row["offsets_limit"];
3068     std::shared_ptr<Tensor> de_expected_limit_tensor;
3069     ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], &de_expected_limit_tensor));
3070     mindspore::MSTensor expected_limit_tensor =
3071       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_limit_tensor));
3072     EXPECT_MSTENSOR_EQ(limit, expected_limit_tensor);
3073     ASSERT_OK(iter->GetNextRow(&row));
3074     i++;
3075   }
3076 
3077   EXPECT_EQ(i, 10);
3078 
3079   // Manually terminate the pipeline
3080   iter->Stop();
3081 }
3082 
3083 TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess6) {
3084   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess6.";
3085   // Test WordpieceTokenizer with max_bytes_per_token equals to 0
3086 
3087   // Create a TextFile dataset
3088   std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
3089   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3090   EXPECT_NE(ds, nullptr);
3091 
3092   // Create Take operation on ds
3093   ds = ds->Take(10);
3094   EXPECT_NE(ds, nullptr);
3095 
3096   // Create a vocab from vector
3097   std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
3098   Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
3099   EXPECT_EQ(s, Status::OK());
3100 
3101   // Create WordpieceTokenizer operation on ds
3102   std::shared_ptr<TensorTransform> wordpiece_tokenizer =
3103     std::make_shared<text::WordpieceTokenizer>(vocab, "##", 0, "[UNK]", true);
3104   EXPECT_NE(wordpiece_tokenizer, nullptr);
3105 
3106   // Create Map operation on ds
3107   ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
3108   EXPECT_NE(ds, nullptr);
3109 
3110   // Create an iterator over the result of the above dataset
3111   // This will trigger the creation of the Execution Tree and launch it.
3112   std::shared_ptr<Iterator> iter = ds->CreateIterator();
3113   EXPECT_NE(iter, nullptr);
3114 
3115   // Iterate the dataset and get each row
3116   std::unordered_map<std::string, mindspore::MSTensor> row;
3117   ASSERT_OK(iter->GetNextRow(&row));
3118 
3119   std::vector<std::vector<std::string>> expected = {{"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"},
3120                                                     {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}};
3121 
3122   uint64_t i = 0;
3123   while (row.size() != 0) {
3124     auto txt = row["token"];
3125     std::shared_ptr<Tensor> de_expected_tensor;
3126     ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
3127     mindspore::MSTensor expected_tensor =
3128       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
3129     EXPECT_MSTENSOR_EQ(txt, expected_tensor);
3130     ASSERT_OK(iter->GetNextRow(&row));
3131     i++;
3132   }
3133 
3134   EXPECT_EQ(i, 10);
3135 
3136   // Manually terminate the pipeline
3137   iter->Stop();
3138 }
3139 
3140 TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail1) {
3141   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail1.";
3142   // Test WordpieceTokenizer with nullptr vocab
3143 
3144   // Create a TextFile dataset
3145   std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
3146   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3147   EXPECT_NE(ds, nullptr);
3148 
3149   // Create WordpieceTokenizer operation on ds
3150   std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(nullptr);
3151   EXPECT_NE(wordpiece_tokenizer, nullptr);
3152 
3153   // Create a Map operation on ds
3154   ds = ds->Map({wordpiece_tokenizer});
3155   EXPECT_NE(ds, nullptr);
3156 
3157   std::shared_ptr<Iterator> iter = ds->CreateIterator();
3158   // Expect failure: invalid WordpieceTokenizer input with nullptr vocab
3159   EXPECT_EQ(iter, nullptr);
3160 }
3161 
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail2.";
  // Test WordpieceTokenizer with negative max_bytes_per_token

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create WordpieceTokenizer operation on ds with max_bytes_per_token = -1;
  // construction succeeds because parameter validation is deferred.
  std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab, "##", -1);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({wordpiece_tokenizer});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid WordpieceTokenizer input with negative max_bytes_per_token
  EXPECT_EQ(iter, nullptr);
}
3188 
3189 TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess) {
3190   // Testing the parameter of UnicodeScriptTokenizer interface when the with_offsets and the keep_whitespace is default.
3191   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess.";
3192 
3193   // Create a TextFile dataset
3194   std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
3195   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3196   EXPECT_NE(ds, nullptr);
3197 
3198   // Create unicodescript_tokenizer operation on ds
3199   std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>();
3200   EXPECT_NE(unicodescript_tokenizer, nullptr);
3201 
3202   // Create Map operation on ds
3203   ds = ds->Map({unicodescript_tokenizer}, {"text"});
3204   EXPECT_NE(ds, nullptr);
3205 
3206   // Create an iterator over the result of the above dataset
3207   // This will trigger the creation of the Execution Tree and launch it.
3208   std::shared_ptr<Iterator> iter = ds->CreateIterator();
3209   EXPECT_NE(iter, nullptr);
3210 
3211   // Iterate the dataset and get each row
3212   std::unordered_map<std::string, mindspore::MSTensor> row;
3213   ASSERT_OK(iter->GetNextRow(&row));
3214 
3215   std::vector<std::vector<std::string>> expected = {
3216     {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};
3217 
3218   uint64_t i = 0;
3219   while (row.size() != 0) {
3220     auto ind = row["text"];
3221 
3222     std::shared_ptr<Tensor> de_expected_tensor;
3223     int x = expected[i].size();
3224     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
3225     mindspore::MSTensor expected_tensor =
3226       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
3227     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
3228 
3229     ASSERT_OK(iter->GetNextRow(&row));
3230     i++;
3231   }
3232 
3233   EXPECT_EQ(i, 4);
3234 
3235   // Manually terminate the pipeline
3236   iter->Stop();
3237 }
3238 
3239 TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess1) {
3240   // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is true and the with_offsets is
3241   // false.
3242   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess1.";
3243 
3244   // Create a TextFile dataset
3245   std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
3246   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3247   EXPECT_NE(ds, nullptr);
3248 
3249   // Create unicodescript_tokenizer operation on ds
3250   std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>(true);
3251   EXPECT_NE(unicodescript_tokenizer, nullptr);
3252 
3253   // Create Map operation on ds
3254   ds = ds->Map({unicodescript_tokenizer}, {"text"});
3255   EXPECT_NE(ds, nullptr);
3256 
3257   // Create an iterator over the result of the above dataset
3258   // This will trigger the creation of the Execution Tree and launch it.
3259   std::shared_ptr<Iterator> iter = ds->CreateIterator();
3260   EXPECT_NE(iter, nullptr);
3261 
3262   // Iterate the dataset and get each row
3263   std::unordered_map<std::string, mindspore::MSTensor> row;
3264   ASSERT_OK(iter->GetNextRow(&row));
3265 
3266   std::vector<std::vector<std::string>> expected = {
3267     {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {"  "}};
3268 
3269   uint64_t i = 0;
3270   while (row.size() != 0) {
3271     auto ind = row["text"];
3272 
3273     std::shared_ptr<Tensor> de_expected_tensor;
3274     int x = expected[i].size();
3275     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
3276     mindspore::MSTensor expected_tensor =
3277       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
3278     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
3279 
3280     ASSERT_OK(iter->GetNextRow(&row));
3281     i++;
3282   }
3283 
3284   EXPECT_EQ(i, 4);
3285 
3286   // Manually terminate the pipeline
3287   iter->Stop();
3288 }
3289 
TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess2) {
  // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is false and the with_offsets is
  // true. Whitespace is dropped from the tokens, and start/limit byte offsets are emitted.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess2.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create unicodescript_tokenizer operation on ds (keep_whitespace = false, with_offsets = true)
  std::shared_ptr<TensorTransform> unicodescript_tokenizer =
    std::make_shared<text::UnicodeScriptTokenizer>(false, true);
  EXPECT_NE(unicodescript_tokenizer, nullptr);

  // Create Map operation on ds; with offsets enabled the op outputs three columns,
  // which are also the columns projected to the iterator (fourth argument).
  ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
               {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected_tokens = {
    {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};

  // Offsets are byte positions in the UTF-8 source line; multi-byte CJK
  // characters account for the larger offset steps.
  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 18, 19}, {15, 18}, {9, 16, 17}, {0}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto token = row["token"];
    auto start = row["offsets_start"];
    auto limit = row["offsets_limit"];

    // Compare the token column against the expected 1-D string tensor.
    std::shared_ptr<Tensor> de_expected_tokens;
    int x = expected_tokens[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
    mindspore::MSTensor ms_expected_tokens =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
    EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);

    // The offsets columns have one entry per token, so shape {x} is reused here.
    std::shared_ptr<Tensor> de_expected_offsets_start;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
    mindspore::MSTensor ms_expected_offsets_start =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
    EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);

    std::shared_ptr<Tensor> de_expected_offsets_limit;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
    mindspore::MSTensor ms_expected_offsets_limit =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
    EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 4 rows of the input file should have been consumed.
  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}
3359 
TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess3) {
  // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is true and the with_offsets is
  // true. Whitespace runs appear as tokens and start/limit byte offsets are emitted.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess3.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create unicodescript_tokenizer operation on ds (keep_whitespace = true, with_offsets = true)
  std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>(true, true);
  EXPECT_NE(unicodescript_tokenizer, nullptr);

  // Create Map operation on ds; with offsets enabled the op outputs three columns,
  // which are also the columns projected to the iterator (fourth argument).
  ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
               {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected_tokens = {
    {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {"  "}};

  // Offsets are byte positions in the UTF-8 source line; unlike the
  // keep_whitespace=false case the last row spans the 2-byte whitespace run.
  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 7, 8, 10, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 8, 10, 11, 18, 19}, {15, 18}, {9, 16, 17}, {2}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto token = row["token"];
    auto start = row["offsets_start"];
    auto limit = row["offsets_limit"];

    // Compare the token column against the expected 1-D string tensor.
    std::shared_ptr<Tensor> de_expected_tokens;
    int x = expected_tokens[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
    mindspore::MSTensor ms_expected_tokens =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
    EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);

    // The offsets columns have one entry per token, so shape {x} is reused here.
    std::shared_ptr<Tensor> de_expected_offsets_start;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
    mindspore::MSTensor ms_expected_offsets_start =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
    EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);

    std::shared_ptr<Tensor> de_expected_offsets_limit;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
    mindspore::MSTensor ms_expected_offsets_limit =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
    EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 4 rows of the input file should have been consumed.
  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}
3428 
3429 TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess) {
3430   // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is default.
3431   MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess.";
3432 
3433   // Create a TextFile dataset
3434   std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
3435   std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3436   EXPECT_NE(ds, nullptr);
3437 
3438   // Create white_tokenizer operation on ds
3439   std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
3440   EXPECT_NE(white_tokenizer, nullptr);
3441 
3442   // Create Map operation on ds
3443   ds = ds->Map({white_tokenizer}, {"text"});
3444   EXPECT_NE(ds, nullptr);
3445 
3446   // Create an iterator over the result of the above dataset
3447   // This will trigger the creation of the Execution Tree and launch it.
3448   std::shared_ptr<Iterator> iter = ds->CreateIterator();
3449   EXPECT_NE(iter, nullptr);
3450 
3451   // Iterate the dataset and get each row
3452   std::unordered_map<std::string, mindspore::MSTensor> row;
3453   ASSERT_OK(iter->GetNextRow(&row));
3454 
3455   std::vector<std::vector<std::string>> expected = {
3456     {"This", "is", "a", "text", "file."}, {"Be", "happy", "every", "day."}, {"Good", "luck", "to", "everyone."}};
3457 
3458   uint64_t i = 0;
3459   while (row.size() != 0) {
3460     auto ind = row["text"];
3461 
3462     std::shared_ptr<Tensor> de_expected_tensor;
3463     int x = expected[i].size();
3464     ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
3465     mindspore::MSTensor expected_tensor =
3466       mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
3467     EXPECT_MSTENSOR_EQ(ind, expected_tensor);
3468 
3469     ASSERT_OK(iter->GetNextRow(&row));
3470     i++;
3471   }
3472 
3473   EXPECT_EQ(i, 3);
3474 
3475   // Manually terminate the pipeline
3476   iter->Stop();
3477 }
3478 
TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess1) {
  // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is true.
  // Tokens plus their start/limit byte offsets are produced as three columns.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create white_tokenizer operation on ds with with_offsets enabled
  std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>(true);
  EXPECT_NE(white_tokenizer, nullptr);

  // Create Map operation on ds; the op outputs three columns, which are also
  // the columns projected to the iterator (fourth argument).
  ds = ds->Map({white_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
               {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected_tokens = {
    {"Welcome", "to", "Beijing!"}, {"北京欢迎您!"}, {"我喜欢English!"}, {""}};

  // Offsets are byte positions in the UTF-8 source line; rows without
  // whitespace yield a single token spanning the whole line.
  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11}, {0}, {0}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 19}, {18}, {17}, {0}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto token = row["token"];
    auto start = row["offsets_start"];
    auto limit = row["offsets_limit"];

    // Compare the token column against the expected 1-D string tensor.
    std::shared_ptr<Tensor> de_expected_tokens;
    int x = expected_tokens[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
    mindspore::MSTensor ms_expected_tokens =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
    EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);

    // The offsets columns have one entry per token, so shape {x} is reused here.
    std::shared_ptr<Tensor> de_expected_offsets_start;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
    mindspore::MSTensor ms_expected_offsets_start =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
    EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);

    std::shared_ptr<Tensor> de_expected_offsets_limit;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
    mindspore::MSTensor ms_expected_offsets_limit =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
    EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 4 rows of the input file should have been consumed.
  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}
3546