1 /**
2 * Copyright 2020-2021 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "common/common.h"
#include "include/api/status.h"
#include "minddata/dataset/include/dataset/config.h"
#include "minddata/dataset/include/dataset/datasets.h"
#include "minddata/dataset/include/dataset/text.h"
#include "minddata/dataset/include/dataset/transforms.h"
#include "minddata/dataset/text/vocab.h"
27
28 using namespace mindspore::dataset;
29 using mindspore::Status;
30 using mindspore::dataset::ShuffleMode;
31 using mindspore::dataset::Tensor;
32 using mindspore::dataset::Vocab;
33
// Test fixture for MindData text-pipeline tests. Utilities such as
// datasets_root_path_ are inherited from the UT::DatasetOpTesting base class.
class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};
37
/// Feature: BasicTokenizer.
/// Description: tokenize the first six lines of basic_tokenizer.txt using all
///   default parameters and compare each row against a hand-written token list.
/// Expectation: every row matches and exactly six rows are produced.
TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess1.";

  // TextFile source, no shuffle, limited to the first 6 rows.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(6);
  EXPECT_NE(dataset, nullptr);

  // Map BasicTokenizer (defaults) over the "text" column.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::BasicTokenizer>();
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {
    {"Welcome", "to", "Beijing", "北", "京", "欢", "迎", "您"},
    {"長", "風", "破", "浪", "會", "有", "時", ",", "直", "掛", "雲", "帆", "濟", "滄", "海"},
    {"", "嘿", "嘿", "", "哈", "哈", "", "大", "笑", "", "嘻", "嘻"},
    {"明", "朝", "(", "1368", "—", "1644", "年", ")", "和", "清", "朝", "(", "1644", "—", "1911", "年", ")",
     ",", "是", "中", "国", "封", "建", "王", "朝", "史", "上", "最", "后", "两", "个", "朝", "代"},
    {"明", "代", "(", "1368", "-", "1644", ")", "と", "清", "代", "(", "1644",
     "-", "1911", ")", "は", "、", "中", "国", "の", "封", "建", "王", "朝",
     "の", "歴", "史", "における", "最", "後", "の2つの", "王", "朝", "でした"},
    {"명나라", "(", "1368", "-", "1644", ")", "와", "청나라", "(", "1644", "-",
     "1911", ")", "는", "중국", "봉건", "왕조의", "역사에서", "마지막", "두", "왕조였다"}};

  // Wraps a DE tensor into the public MSTensor type for comparison.
  auto to_ms = [](const std::shared_ptr<Tensor> &t) {
    return mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(t));
  };

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto txt = row["text"];
    std::shared_ptr<Tensor> de_expected;
    ASSERT_OK(Tensor::CreateFromVector(expected[num_rows], &de_expected));
    EXPECT_MSTENSOR_EQ(txt, to_ms(de_expected));
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(num_rows, 6);

  // Manually terminate the pipeline.
  it->Stop();
}
98
/// Feature: BasicTokenizer.
/// Description: tokenize the seventh line of basic_tokenizer.txt with
///   lower_case enabled.
/// Expectation: a single lower-cased row matching the expected tokens.
TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess2.";

  // TextFile source, no shuffle; skip the six rows covered by Success1.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Skip(6);
  EXPECT_NE(dataset, nullptr);

  // BasicTokenizer with lower_case = true.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::BasicTokenizer>(true);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  std::vector<std::string> expected = {"this", "is", "a", "funky", "string"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  mindspore::MSTensor expected_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto txt = row["text"];
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(num_rows, 1);

  // Manually terminate the pipeline.
  it->Stop();
}
148
/// Feature: BasicTokenizer.
/// Description: tokenize with lower_case = true and with_offsets = true so the
///   op emits three output columns: token, offsets_start, offsets_limit.
/// Expectation: the single produced row matches tokens and both offset arrays.
TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess3.";

  // TextFile source, no shuffle; skip to the seventh row.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Skip(6);
  EXPECT_NE(dataset, nullptr);

  // lower_case = true, keep_whitespace = false, no normalization,
  // preserve_unused_token = true, with_offsets = true.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BasicTokenizer>(true, false, NormalizeForm::kNone, true, true);
  EXPECT_NE(tokenizer, nullptr);

  // with_offsets = true means the op produces three columns.
  dataset = dataset->Map({tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  std::vector<std::string> expected_tokens = {"this", "is", "a", "funky", "string"};
  std::vector<uint32_t> expected_offsets_start = {0, 5, 8, 10, 16};
  std::vector<uint32_t> expected_offsets_limit = {4, 7, 9, 15, 22};

  // Wraps a DE tensor into the public MSTensor type for comparison.
  auto to_ms = [](const std::shared_ptr<Tensor> &t) {
    return mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(t));
  };

  std::shared_ptr<Tensor> de_tokens;
  ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_tokens));
  mindspore::MSTensor ms_tokens = to_ms(de_tokens);

  std::shared_ptr<Tensor> de_starts;
  ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_starts));
  mindspore::MSTensor ms_starts = to_ms(de_starts);

  std::shared_ptr<Tensor> de_limits;
  ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_limits));
  mindspore::MSTensor ms_limits = to_ms(de_limits);

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    EXPECT_MSTENSOR_EQ(row["token"], ms_tokens);
    EXPECT_MSTENSOR_EQ(row["offsets_start"], ms_starts);
    EXPECT_MSTENSOR_EQ(row["offsets_limit"], ms_limits);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(num_rows, 1);

  // Manually terminate the pipeline.
  it->Stop();
}
219
// Shared vocabulary for the BertTokenizer test cases below.  It mixes CJK
// single characters, English word pieces (with "##" continuation prefixes),
// digits, punctuation, a whitespace entry, and the BERT special/unused tokens.
// NOTE(review): several entries appear as empty strings "" — presumably
// non-BMP characters (e.g. emoji) lost in extraction; verify against the
// original test data.
std::vector<std::string> list = {
  "床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头",
  "望", "低", "思", "故", "乡", "繁", "體", "字", "嘿", "哈", "大", "笑",
  "嘻", "i", "am", "mak", "make", "small", "mistake", "##s", "during", "work", "##ing", "hour",
  "", "", "", "", "+", "/", "-", "=", "12", "28", "40", "16",
  " ", "I", "[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]", "[unused1]", "[unused10]"};
226
/// Feature: BertTokenizer.
/// Description: tokenize the first four lines of bert_tokenizer.txt with a
///   vocab built from the shared `list` and all default parameters.
/// Expectation: four rows, each matching its expected token vector.
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess1.";

  // TextFile source, no shuffle, first 4 rows only.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(4);
  EXPECT_NE(dataset, nullptr);

  // Build the vocab from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // BertTokenizer with default parameters.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::BertTokenizer>(vocab);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {{"床", "前", "明", "月", "光"},
                                                    {"疑", "是", "地", "上", "霜"},
                                                    {"举", "头", "望", "明", "月"},
                                                    {"低", "头", "思", "故", "乡"}};

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto txt = row["text"];
    std::shared_ptr<Tensor> de_expected;
    ASSERT_OK(Tensor::CreateFromVector(expected[num_rows], &de_expected));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(num_rows, 4);

  // Manually terminate the pipeline.
  it->Stop();
}
285
/// Feature: BertTokenizer.
/// Description: tokenize the fifth line of bert_tokenizer.txt with
///   lower_case = true.
/// Expectation: one row matching the expected word-piece sequence.
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess2.";

  // TextFile source, no shuffle; select exactly the fifth row.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Skip(4);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(1);
  EXPECT_NE(dataset, nullptr);

  // Build the vocab from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // BertTokenizer with lower_case = true.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", true);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  std::vector<std::string> expected = {"i", "am", "mak", "##ing", "small", "mistake",
                                       "##s", "during", "work", "##ing", "hour", "##s"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  mindspore::MSTensor expected_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto txt = row["text"];
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(num_rows, 1);

  // Manually terminate the pipeline.
  it->Stop();
}
346
/// Feature: BertTokenizer.
/// Description: tokenize rows six and seven of bert_tokenizer.txt with
///   normalization_form set to NormalizeForm::kNfc.
/// Expectation: two rows, each matching its expected token vector.
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess3.";

  // TextFile source, no shuffle; select rows 6 and 7.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Skip(5);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(2);
  EXPECT_NE(dataset, nullptr);

  // Build the vocab from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // BertTokenizer with normalization_form = NormalizeForm::kNfc.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", false, false, NormalizeForm::kNfc);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {
    {"", "嘿", "嘿", "", "哈", "哈", "", "大", "笑", "", "嘻", "嘻"}, {"繁", "體", "字"}};

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto txt = row["text"];
    std::shared_ptr<Tensor> de_expected;
    ASSERT_OK(Tensor::CreateFromVector(expected[num_rows], &de_expected));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(num_rows, 2);

  // Manually terminate the pipeline.
  it->Stop();
}
408
/// Feature: BertTokenizer.
/// Description: tokenize the eighth line of bert_tokenizer.txt with
///   keep_whitespace = true so whitespace survives as its own token.
/// Expectation: one row equal to {"[UNK]", " ", "[CLS]"}.
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess4) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess4.";

  // TextFile source, no shuffle; select exactly the eighth row.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Skip(7);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(1);
  EXPECT_NE(dataset, nullptr);

  // Build the vocab from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // BertTokenizer with keep_whitespace = true.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", false, true);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  std::vector<std::string> expected = {"[UNK]", " ", "[CLS]"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  mindspore::MSTensor expected_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto txt = row["text"];
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(num_rows, 1);

  // Manually terminate the pipeline.
  it->Stop();
}
468
/// Feature: BertTokenizer.
/// Description: tokenize the eighth line of bert_tokenizer.txt with an empty
///   unknown_token (out-of-vocab text is kept as-is) and keep_whitespace = true.
/// Expectation: one row equal to {"unused", " ", "[CLS]"}.
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess5) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess5.";

  // TextFile source, no shuffle; select exactly the eighth row.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Skip(7);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(1);
  EXPECT_NE(dataset, nullptr);

  // Build the vocab from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // BertTokenizer with unknown_token = "" and keep_whitespace = true.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "", false, true);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  std::vector<std::string> expected = {"unused", " ", "[CLS]"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  mindspore::MSTensor expected_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto txt = row["text"];
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(num_rows, 1);

  // Manually terminate the pipeline.
  it->Stop();
}
528
/// Feature: BertTokenizer.
/// Description: tokenize the eighth line of bert_tokenizer.txt with
///   preserve_unused_token = false (so "[CLS]" is split apart), an empty
///   unknown_token, and keep_whitespace = true.
/// Expectation: one row equal to {"unused", " ", "[", "CLS", "]"}.
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess6) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess6.";

  // TextFile source, no shuffle; select exactly the eighth row.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Skip(7);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(1);
  EXPECT_NE(dataset, nullptr);

  // Build the vocab from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // unknown_token = "", keep_whitespace = true, preserve_unused_token = false.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "", false, true, NormalizeForm::kNone, false);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  std::vector<std::string> expected = {"unused", " ", "[", "CLS", "]"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  mindspore::MSTensor expected_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto txt = row["text"];
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(num_rows, 1);

  // Manually terminate the pipeline.
  it->Stop();
}
588
/// Feature: BertTokenizer.
/// Description: tokenize the fifth line of bert_tokenizer.txt with
///   lower_case = true and with_offsets = true, producing three output
///   columns: token, offsets_start, offsets_limit.
/// Expectation: the single produced row matches tokens and both offset arrays.
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess7) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess7.";

  // TextFile source, no shuffle; select exactly the fifth row.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Skip(4);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(1);
  EXPECT_NE(dataset, nullptr);

  // Build the vocab from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // lower_case = true and with_offsets = true.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", true, false, NormalizeForm::kNone, true, true);
  EXPECT_NE(tokenizer, nullptr);

  // with_offsets = true means the op produces three columns.
  dataset = dataset->Map({tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  std::vector<std::string> expected_tokens = {"i", "am", "mak", "##ing", "small", "mistake",
                                              "##s", "during", "work", "##ing", "hour", "##s"};
  std::vector<uint32_t> expected_offsets_start = {0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46};
  std::vector<uint32_t> expected_offsets_limit = {1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47};

  // Wraps a DE tensor into the public MSTensor type for comparison.
  auto to_ms = [](const std::shared_ptr<Tensor> &t) {
    return mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(t));
  };

  std::shared_ptr<Tensor> de_tokens;
  ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_tokens));
  mindspore::MSTensor ms_tokens = to_ms(de_tokens);

  std::shared_ptr<Tensor> de_starts;
  ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_starts));
  mindspore::MSTensor ms_starts = to_ms(de_starts);

  std::shared_ptr<Tensor> de_limits;
  ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_limits));
  mindspore::MSTensor ms_limits = to_ms(de_limits);

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    EXPECT_MSTENSOR_EQ(row["token"], ms_tokens);
    EXPECT_MSTENSOR_EQ(row["offsets_start"], ms_starts);
    EXPECT_MSTENSOR_EQ(row["offsets_limit"], ms_limits);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(num_rows, 1);

  // Manually terminate the pipeline.
  it->Stop();
}
669
/// Feature: BertTokenizer.
/// Description: construct a BertTokenizer with a null vocab and try to build
///   a pipeline with it.
/// Expectation: CreateIterator returns nullptr (validation rejects the op).
TEST_F(MindDataTestPipeline, TestBertTokenizerFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail1.";

  // TextFile source, no shuffle.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // BertTokenizer constructed with a nullptr vocab.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::BertTokenizer>(nullptr);
  EXPECT_NE(tokenizer, nullptr);

  dataset = dataset->Map({tokenizer});
  EXPECT_NE(dataset, nullptr);

  // Expect failure: invalid BertTokenizer input with nullptr vocab.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_EQ(it, nullptr);
}
691
/// Feature: BertTokenizer.
/// Description: construct a BertTokenizer with a negative max_bytes_per_token
///   and try to build a pipeline with it.
/// Expectation: CreateIterator returns nullptr (validation rejects the op).
TEST_F(MindDataTestPipeline, TestBertTokenizerFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail2.";
  // Test BertTokenizer with negative max_bytes_per_token

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create BertTokenizer operation on ds; -1 is an invalid max_bytes_per_token
  std::shared_ptr<TensorTransform> bert_tokenizer = std::make_shared<text::BertTokenizer>(vocab, "##", -1);
  EXPECT_NE(bert_tokenizer, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({bert_tokenizer});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid BertTokenizer input with negative max_bytes_per_token
  EXPECT_EQ(iter, nullptr);
}
718
/// Feature: CaseFold.
/// Description: apply CaseFold to every line of 1.txt and compare each row
///   against its expected lower-cased string.
/// Expectation: four rows, all matching.
TEST_F(MindDataTestPipeline, TestCaseFoldSuccess) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaseFoldSuccess.";

  // TextFile source, no shuffle.
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Map CaseFold over the "text" column.
  std::shared_ptr<TensorTransform> casefold = std::make_shared<text::CaseFold>();
  EXPECT_NE(casefold, nullptr);
  dataset = dataset->Map({casefold}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  std::vector<std::string> expected = {"welcome to beijing!", "北京欢迎您!", "我喜欢english!", " "};

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto txt = row["text"];
    // CaseFold keeps the row as a single scalar string, so compare scalars.
    std::shared_ptr<Tensor> de_expected;
    ASSERT_OK(Tensor::CreateScalar(expected[num_rows], &de_expected));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(num_rows, 4);

  // Manually terminate the pipeline.
  it->Stop();
}
763
/// Feature: JiebaTokenizer.
/// Description: tokenize 3.txt with JiebaMode::kMp and with_offsets = false.
/// Expectation: one row matching the expected MP-mode segmentation.
TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess.";

  // TextFile source plus the jieba dictionary files.
  std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // JiebaTokenizer in MP (max-probability) mode.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  mindspore::MSTensor expected_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto txt = row["text"];
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(num_rows, 1);

  // Manually terminate the pipeline.
  it->Stop();
}
812
/// Feature: JiebaTokenizer.
/// Description: tokenize 3.txt with JiebaMode::kHmm and with_offsets = false.
/// Expectation: one row matching the expected HMM-mode segmentation.
TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess1.";

  // TextFile source plus the jieba dictionary files.
  std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // JiebaTokenizer in HMM mode.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kHmm);
  EXPECT_NE(tokenizer, nullptr);
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> it = dataset->CreateIterator();
  EXPECT_NE(it, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(it->GetNextRow(&row));

  std::vector<std::string> expected = {"今天", "天气", "太", "好", "了", "我们", "一起", "去", "外面", "玩", "吧"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  mindspore::MSTensor expected_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto txt = row["text"];
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(it->GetNextRow(&row));
  }

  EXPECT_EQ(num_rows, 1);

  // Manually terminate the pipeline.
  it->Stop();
}
861
TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess2) {
  // JiebaTokenizer in MP mode with with_offsets = true: the op emits three
  // columns (token, offsets_start, offsets_limit) instead of a single text column.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess2.";

  // Source text file and jieba dictionary files.
  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Tokenizer under test, with offset reporting enabled.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp, true);
  EXPECT_NE(tokenizer, nullptr);

  // Map "text" into the three tokenizer output columns and project all of them.
  dataset = dataset->Map({tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
                         {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected tokens plus their byte offsets within the input line.
  const std::vector<std::string> expected_tokens = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
  const std::vector<uint32_t> expected_offsets_start = {0, 12, 21, 27, 33, 36, 42};
  const std::vector<uint32_t> expected_offsets_limit = {12, 21, 27, 33, 36, 42, 48};

  std::shared_ptr<Tensor> de_tokens;
  ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_tokens));
  mindspore::MSTensor ms_tokens(std::make_shared<mindspore::dataset::DETensor>(de_tokens));

  std::shared_ptr<Tensor> de_starts;
  ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_starts));
  mindspore::MSTensor ms_starts(std::make_shared<mindspore::dataset::DETensor>(de_starts));

  std::shared_ptr<Tensor> de_limits;
  ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_limits));
  mindspore::MSTensor ms_limits(std::make_shared<mindspore::dataset::DETensor>(de_limits));

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));
  uint64_t rows_seen = 0;
  while (!row.empty()) {
    auto tokens = row["token"];
    EXPECT_MSTENSOR_EQ(tokens, ms_tokens);

    auto starts = row["offsets_start"];
    EXPECT_MSTENSOR_EQ(starts, ms_starts);

    auto limits = row["offsets_limit"];
    EXPECT_MSTENSOR_EQ(limits, ms_limits);

    ++rows_seen;
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  // The file contributes exactly one row.
  EXPECT_EQ(rows_seen, 1);

  // Shut the pipeline down explicitly.
  iterator->Stop();
}
931
TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail1) {
  // JiebaTokenizer with an empty hmm_path: construction succeeds, but the
  // invalid parameter is detected when the pipeline is built.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail1.";

  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Empty hmm_path is invalid input.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::JiebaTokenizer>("", mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  dataset = dataset->Map({tokenizer});
  EXPECT_NE(dataset, nullptr);

  // Validation happens at iterator creation, which must fail here.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
956
TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail2) {
  // JiebaTokenizer with an empty mp_path: construction succeeds, but the
  // invalid parameter is detected when the pipeline is built.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail2.";

  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Empty mp_path is invalid input.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, "", JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  dataset = dataset->Map({tokenizer});
  EXPECT_NE(dataset, nullptr);

  // Validation happens at iterator creation, which must fail here.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
981
TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail3) {
  // JiebaTokenizer with a non-existent hmm_path: construction succeeds, but
  // the bad path is detected when the pipeline is built.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail3.";

  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  const std::string hmm_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // The hmm model path points at a file that does not exist.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path_invalid, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  dataset = dataset->Map({tokenizer});
  EXPECT_NE(dataset, nullptr);

  // Validation happens at iterator creation, which must fail here.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
1007
TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail4) {
  // JiebaTokenizer with a non-existent mp_path: construction succeeds, but
  // the bad path is detected when the pipeline is built.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail4.";

  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // The MP dictionary path points at a file that does not exist.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path_invalid, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  dataset = dataset->Map({tokenizer});
  EXPECT_NE(dataset, nullptr);

  // Validation happens at iterator creation, which must fail here.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
1033
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) {
  // AddWord with the frequency argument omitted (defaults to 0): the added
  // word "男默女泪" must appear as a single token in the segmentation.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord.";

  // Source text file and jieba dictionary files.
  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file});
  EXPECT_NE(dataset, nullptr);

  // Concrete tokenizer type is needed to reach AddWord.
  std::shared_ptr<text::JiebaTokenizer> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  // Register the custom word without an explicit frequency.
  ASSERT_OK(tokenizer->AddWord("男默女泪"));

  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected segmentation with the custom word kept intact.
  const std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  mindspore::MSTensor expected_tensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));
  uint64_t rows_seen = 0;
  while (!row.empty()) {
    auto actual = row["text"];
    EXPECT_MSTENSOR_EQ(actual, expected_tensor);
    ++rows_seen;
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  // The file contributes exactly one row.
  EXPECT_EQ(rows_seen, 1);

  // Shut the pipeline down explicitly.
  iterator->Stop();
}
1085
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) {
  // AddWord with the frequency passed explicitly as 0: behaves the same as
  // the default, so "男默女泪" must appear as a single token.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord1.";

  // Source text file and jieba dictionary files.
  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file});
  EXPECT_NE(dataset, nullptr);

  // Concrete tokenizer type is needed to reach AddWord.
  std::shared_ptr<text::JiebaTokenizer> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  // Register the custom word with an explicit zero frequency.
  ASSERT_OK(tokenizer->AddWord("男默女泪", 0));

  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected segmentation with the custom word kept intact.
  const std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  mindspore::MSTensor expected_tensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));
  uint64_t rows_seen = 0;
  while (!row.empty()) {
    auto actual = row["text"];
    EXPECT_MSTENSOR_EQ(actual, expected_tensor);
    ++rows_seen;
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  // The file contributes exactly one row.
  EXPECT_EQ(rows_seen, 1);

  // Shut the pipeline down explicitly.
  iterator->Stop();
}
1137
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) {
  // AddWord with a small positive frequency (10): "男默女泪" must still appear
  // as a single token in the segmentation.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord2.";

  // Source text file and jieba dictionary files.
  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file});
  EXPECT_NE(dataset, nullptr);

  // Concrete tokenizer type is needed to reach AddWord.
  std::shared_ptr<text::JiebaTokenizer> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  // Register the custom word with frequency 10.
  ASSERT_OK(tokenizer->AddWord("男默女泪", 10));

  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected segmentation with the custom word kept intact.
  const std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  mindspore::MSTensor expected_tensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));
  uint64_t rows_seen = 0;
  while (!row.empty()) {
    auto actual = row["text"];
    EXPECT_MSTENSOR_EQ(actual, expected_tensor);
    ++rows_seen;
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  // The file contributes exactly one row.
  EXPECT_EQ(rows_seen, 1);

  // Shut the pipeline down explicitly.
  iterator->Stop();
}
1189
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) {
  // AddWord with a large frequency (20000) that is high enough to change the
  // segmentation: "江大桥" wins over the default dictionary split.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord3.";

  // Source text file and jieba dictionary files.
  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file});
  EXPECT_NE(dataset, nullptr);

  // Concrete tokenizer type is needed to reach AddWord.
  std::shared_ptr<text::JiebaTokenizer> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  // Register the custom word with a frequency high enough to affect results.
  ASSERT_OK(tokenizer->AddWord("江大桥", 20000));

  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected segmentation influenced by the boosted custom word.
  const std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  mindspore::MSTensor expected_tensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));
  uint64_t rows_seen = 0;
  while (!row.empty()) {
    auto actual = row["text"];
    EXPECT_MSTENSOR_EQ(actual, expected_tensor);
    ++rows_seen;
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  // The file contributes exactly one row.
  EXPECT_EQ(rows_seen, 1);

  // Shut the pipeline down explicitly.
  iterator->Stop();
}
1241
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) {
  // AddWord input validation: an empty word and a negative frequency must
  // both be rejected with a non-OK status.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWordFail.";

  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Case 1: empty word is invalid.
  std::shared_ptr<text::JiebaTokenizer> tokenizer_empty_word =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer_empty_word, nullptr);
  EXPECT_NE(tokenizer_empty_word->AddWord("", 10), Status::OK());

  // Case 2: negative frequency is invalid.
  std::shared_ptr<text::JiebaTokenizer> tokenizer_neg_freq =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer_neg_freq, nullptr);
  EXPECT_NE(tokenizer_neg_freq->AddWord("我们", -1), Status::OK());
}
1264
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDict) {
  // AddDict taking a vector of (word, frequency) pairs: boosting "江大桥" to
  // 20000 must change the segmentation the same way AddWord would.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDict.";

  // Source text file and jieba dictionary files.
  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file});
  EXPECT_NE(dataset, nullptr);

  // Concrete tokenizer type is needed to reach AddDict.
  std::shared_ptr<text::JiebaTokenizer> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  // Register the custom vocabulary entry with frequency 20000.
  const std::vector<std::pair<std::string, int64_t>> user_dict = {{"江大桥", 20000}};
  ASSERT_OK(tokenizer->AddDict(user_dict));

  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected segmentation influenced by the boosted dictionary entry.
  const std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  mindspore::MSTensor expected_tensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));
  uint64_t rows_seen = 0;
  while (!row.empty()) {
    auto actual = row["text"];
    EXPECT_MSTENSOR_EQ(actual, expected_tensor);
    ++rows_seen;
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  // The file contributes exactly one row.
  EXPECT_EQ(rows_seen, 1);

  // Shut the pipeline down explicitly.
  iterator->Stop();
}
1317
TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDictFromFile) {
  // AddDict taking a file path: a non-existent path must fail, a valid user
  // dictionary must load, and tokenization then matches the MP-mode result.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDictFromFile.";

  // Source text file and jieba dictionary files.
  const std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
  const std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
  const std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
  std::shared_ptr<Dataset> dataset = TextFile({data_file});
  EXPECT_NE(dataset, nullptr);

  // Concrete tokenizer type is needed to reach AddDict.
  std::shared_ptr<text::JiebaTokenizer> tokenizer =
    std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
  EXPECT_NE(tokenizer, nullptr);

  // Error scenario first: loading from a missing file must fail.
  const std::string user_dict_path = datasets_root_path_ + "/testJiebaDataset/user_dict.txt";
  const std::string invalid_path = datasets_root_path_ + "/testJiebaDataset/invalid_path.txt";
  EXPECT_ERROR(tokenizer->AddDict(invalid_path));
  // Then the valid user dictionary loads cleanly.
  ASSERT_OK(tokenizer->AddDict(user_dict_path));

  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected MP-mode segmentation of the one input row.
  const std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
  std::shared_ptr<Tensor> de_expected;
  ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected));
  mindspore::MSTensor expected_tensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));
  uint64_t rows_seen = 0;
  while (!row.empty()) {
    auto actual = row["text"];
    EXPECT_MSTENSOR_EQ(actual, expected_tensor);
    ++rows_seen;
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  // The file contributes exactly one row.
  EXPECT_EQ(rows_seen, 1);

  // Shut the pipeline down explicitly.
  iterator->Stop();
}
1373
TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) {
  // SlidingWindow with width 3 over axis 0: each line's token sequence becomes
  // a 2-D tensor whose rows are consecutive 3-token windows.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess.";

  // Source text file with three lines.
  const std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Tokenize on whitespace, then slide a width-3 window along axis 0.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  EXPECT_NE(tokenizer, nullptr);
  std::shared_ptr<TensorTransform> window = std::make_shared<text::SlidingWindow>(3, 0);
  EXPECT_NE(window, nullptr);

  dataset = dataset->Map({tokenizer, window}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Flattened expected windows per line; each reshapes to (windows, 3).
  const std::vector<std::vector<std::string>> expected = {
    {"This", "is", "a", "is", "a", "text", "a", "text", "file."},
    {"Be", "happy", "every", "happy", "every", "day."},
    {"Good", "luck", "to", "luck", "to", "everyone."}};

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));
  uint64_t rows_seen = 0;
  while (!row.empty()) {
    auto actual = row["text"];

    std::shared_ptr<Tensor> de_expected;
    int num_windows = expected[rows_seen].size() / 3;
    ASSERT_OK(Tensor::CreateFromVector(expected[rows_seen], TensorShape({num_windows, 3}), &de_expected));
    mindspore::MSTensor expected_tensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
    EXPECT_MSTENSOR_EQ(actual, expected_tensor);

    ++rows_seen;
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  // One output row per input line.
  EXPECT_EQ(rows_seen, 3);

  // Shut the pipeline down explicitly.
  iterator->Stop();
}
1427
TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess1) {
  // SlidingWindow with width 2 over axis -1 (the last axis): each line's token
  // sequence becomes a 2-D tensor whose rows are consecutive 2-token windows.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess1.";

  // Source text file with three lines.
  const std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Tokenize on whitespace, then slide a width-2 window along the last axis.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  EXPECT_NE(tokenizer, nullptr);
  std::shared_ptr<TensorTransform> window = std::make_shared<text::SlidingWindow>(2, -1);
  EXPECT_NE(window, nullptr);

  dataset = dataset->Map({tokenizer, window}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Flattened expected windows per line; each reshapes to (windows, 2).
  const std::vector<std::vector<std::string>> expected = {
    {"This", "is", "is", "a", "a", "text", "text", "file."},
    {"Be", "happy", "happy", "every", "every", "day."},
    {"Good", "luck", "luck", "to", "to", "everyone."}};

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));
  uint64_t rows_seen = 0;
  while (!row.empty()) {
    auto actual = row["text"];

    std::shared_ptr<Tensor> de_expected;
    int num_windows = expected[rows_seen].size() / 2;
    ASSERT_OK(Tensor::CreateFromVector(expected[rows_seen], TensorShape({num_windows, 2}), &de_expected));
    mindspore::MSTensor expected_tensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
    EXPECT_MSTENSOR_EQ(actual, expected_tensor);

    ++rows_seen;
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  // One output row per input line.
  EXPECT_EQ(rows_seen, 3);

  // Shut the pipeline down explicitly.
  iterator->Stop();
}
1480
TEST_F(MindDataTestPipeline, TestSlidingWindowFail1) {
  // SlidingWindow with width 0: width must be strictly positive, so the
  // pipeline must fail to build.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail1.";

  const std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Width 0 is invalid (axis 0 itself is legal; only 0 and -1 are supported).
  std::shared_ptr<TensorTransform> window = std::make_shared<text::SlidingWindow>(0, 0);
  EXPECT_NE(window, nullptr);

  dataset = dataset->Map({window});
  EXPECT_NE(dataset, nullptr);

  // Validation happens at iterator creation, which must fail here.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
1504
TEST_F(MindDataTestPipeline, TestSlidingWindowFail2) {
  // SlidingWindow with a negative width (-2): width must be strictly positive,
  // so the pipeline must fail to build.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail2.";

  const std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Width -2 is invalid (axis 0 itself is legal; only 0 and -1 are supported).
  std::shared_ptr<TensorTransform> window = std::make_shared<text::SlidingWindow>(-2, 0);
  EXPECT_NE(window, nullptr);

  dataset = dataset->Map({window});
  EXPECT_NE(dataset, nullptr);

  // Validation happens at iterator creation, which must fail here.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
1528
TEST_F(MindDataTestPipeline, TestToNumberSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberSuccess1.";
  // ToNumber with an integral target type: the first 8 lines of the input
  // file are string representations of int64 values.

  const std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Source text file, read in order.
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Keep only the integer-valued lines.
  dataset = dataset->Take(8);
  EXPECT_NE(dataset, nullptr);

  // Convert each string row to an int64 scalar.
  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
  EXPECT_NE(to_number, nullptr);

  dataset = dataset->Map({to_number}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected parsed values, one per row (the last exceeds 32-bit range).
  const std::vector<int64_t> expected = {-121, 14, -2219, 7623, -8162536, 162371864, -1726483716, 98921728421};

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));
  uint64_t rows_seen = 0;
  while (!row.empty()) {
    auto actual = row["text"];
    std::shared_ptr<Tensor> de_expected;
    ASSERT_OK(Tensor::CreateScalar(expected[rows_seen], &de_expected));
    mindspore::MSTensor expected_tensor(std::make_shared<mindspore::dataset::DETensor>(de_expected));
    EXPECT_MSTENSOR_EQ(actual, expected_tensor);
    ++rows_seen;
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  // Exactly the 8 taken rows are produced.
  EXPECT_EQ(rows_seen, 8);

  // Shut the pipeline down explicitly.
  iterator->Stop();
}
1579
TEST_F(MindDataTestPipeline, TestToNumberSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberSuccess2.";
  // ToNumber should parse the 6 float samples (after the 8 integers) as float64.

  const std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Non-shuffled TextFile reader; skip the integer block, keep the 6 float lines.
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Skip(8);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(6);
  EXPECT_NE(dataset, nullptr);

  // Route the "text" column through ToNumber targeting float64.
  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
  EXPECT_NE(to_number, nullptr);
  dataset = dataset->Map({to_number}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  const std::vector<double_t> expected = {-1.1, 1.4, -2219.321, 7623.453, -816256.234282, 162371864.243243};

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  uint64_t count = 0;
  for (; !row.empty(); ++count) {
    // Each produced scalar must equal the literal parsed from the file.
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[count], &de_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(row["text"], expected_tensor);
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  EXPECT_EQ(count, 6);

  // Tear the pipeline down explicitly.
  iterator->Stop();
}
1634
TEST_F(MindDataTestPipeline, TestToNumberFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail1.";
  // Values larger than int8 can hold must make ToNumber fail at runtime.

  const std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Non-shuffled TextFile reader over 6 samples that overflow int8.
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Skip(2);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(6);
  EXPECT_NE(dataset, nullptr);

  // Target type int8 is too narrow for the selected samples.
  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
  EXPECT_NE(to_number, nullptr);
  dataset = dataset->Map({to_number}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Tree construction itself succeeds; the failure surfaces while iterating.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;

  // The very first fetch errors out: input out of bounds of int8.
  EXPECT_ERROR(iterator->GetNextRow(&row));

  uint64_t count = 0;
  while (!row.empty()) {
    EXPECT_ERROR(iterator->GetNextRow(&row));
    ++count;
  }

  // No row should have been produced at all.
  EXPECT_EQ(count, 0);

  // Tear the pipeline down explicitly.
  iterator->Stop();
}
1684
TEST_F(MindDataTestPipeline, TestToNumberFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail2.";
  // Float values outside float16 range must make ToNumber fail at runtime.

  const std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Non-shuffled TextFile reader over the 2 samples that overflow float16.
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Skip(12);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Take(2);
  EXPECT_NE(dataset, nullptr);

  // Target type float16 is too narrow for the selected samples.
  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
  EXPECT_NE(to_number, nullptr);
  dataset = dataset->Map({to_number}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Tree construction itself succeeds; the failure surfaces while iterating.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;

  // The very first fetch errors out: input out of bounds of float16.
  EXPECT_ERROR(iterator->GetNextRow(&row));

  uint64_t count = 0;
  while (!row.empty()) {
    EXPECT_ERROR(iterator->GetNextRow(&row));
    ++count;
  }

  // No row should have been produced at all.
  EXPECT_EQ(count, 0);

  // Tear the pipeline down explicitly.
  iterator->Stop();
}
1734
TEST_F(MindDataTestPipeline, TestToNumberFail3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail3.";
  // Non-numerical strings must make ToNumber fail at runtime.

  const std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Non-shuffled TextFile reader positioned on the non-numeric tail of the file.
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);
  dataset = dataset->Skip(14);
  EXPECT_NE(dataset, nullptr);

  // The remaining samples cannot be parsed as int64.
  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
  EXPECT_NE(to_number, nullptr);
  dataset = dataset->Map({to_number}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Tree construction itself succeeds; the failure surfaces while iterating.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;

  // The very first fetch errors out: invalid non-numerical input.
  EXPECT_ERROR(iterator->GetNextRow(&row));

  uint64_t count = 0;
  while (!row.empty()) {
    EXPECT_ERROR(iterator->GetNextRow(&row));
    ++count;
  }

  // No row should have been produced at all.
  EXPECT_EQ(count, 0);

  // Tear the pipeline down explicitly.
  iterator->Stop();
}
1780
TEST_F(MindDataTestPipeline, TestToNumberFail4) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail4.";
  // A string target type is not numeric, so validation must reject ToNumber.

  const std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Non-shuffled TextFile reader.
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // ToNumber configured with a non-numerical data type.
  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kObjectTypeString);
  EXPECT_NE(to_number, nullptr);
  dataset = dataset->Map({to_number}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Validation during tree construction rejects the transform.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
1803
TEST_F(MindDataTestPipeline, TestToNumberFail5) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail5.";
  // A bool target type is not numeric, so validation must reject ToNumber.

  const std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Non-shuffled TextFile reader.
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // ToNumber configured with a non-numerical data type.
  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeBool);
  EXPECT_NE(to_number, nullptr);
  dataset = dataset->Map({to_number}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Validation during tree construction rejects the transform.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
1826
TEST_F(MindDataTestPipeline, TestTruncateSequencePairSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairSuccess1.";
  // TruncateSequencePair(4) should trim a {5}/{3} column pair to 2+2 elements.

  // Pin seed and worker count so RandomData reproduces the expected values.
  auto original_seed = config::get_seed();
  bool status_set_seed = config::set_seed(0);
  EXPECT_EQ(status_set_seed, true);
  auto original_worker = config::get_num_parallel_workers();
  bool status_set_worker = config::set_num_parallel_workers(1);
  EXPECT_EQ(status_set_worker, true);

  // RandomDataset with an int16 {5} column and an int32 {3} column, 3 rows.
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt16, {5}));
  ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt32, {3}));
  std::shared_ptr<Dataset> dataset = RandomData(3, schema);
  EXPECT_NE(dataset, nullptr);

  // Truncate the pair of columns to a combined length of 4.
  std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(4);
  EXPECT_NE(truncate_sequence_pair, nullptr);
  dataset = dataset->Map({truncate_sequence_pair}, {"col1", "col2"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  const std::vector<std::vector<int16_t>> expected1 = {{-29556, -29556}, {-18505, -18505}, {-25958, -25958}};
  const std::vector<std::vector<int32_t>> expected2 = {
    {-1751672937, -1751672937}, {-656877352, -656877352}, {-606348325, -606348325}};

  uint64_t count = 0;
  for (; !row.empty(); ++count) {
    // Both truncated columns must match the seeded reference data.
    std::shared_ptr<Tensor> de_tensor1;
    ASSERT_OK(Tensor::CreateFromVector(expected1[count], &de_tensor1));
    mindspore::MSTensor expected_tensor1 =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor1));
    EXPECT_MSTENSOR_EQ(row["col1"], expected_tensor1);

    std::shared_ptr<Tensor> de_tensor2;
    ASSERT_OK(Tensor::CreateFromVector(expected2[count], &de_tensor2));
    mindspore::MSTensor expected_tensor2 =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor2));
    EXPECT_MSTENSOR_EQ(row["col2"], expected_tensor2);

    ASSERT_OK(iterator->GetNextRow(&row));
  }

  EXPECT_EQ(count, 3);

  // Tear the pipeline down explicitly.
  iterator->Stop();

  // Restore the global config touched above.
  status_set_seed = config::set_seed(original_seed);
  EXPECT_EQ(status_set_seed, true);
  status_set_worker = config::set_num_parallel_workers(original_worker);
  EXPECT_EQ(status_set_worker, true);
}
1901
TEST_F(MindDataTestPipeline, TestTruncateSequencePairSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairSuccess2.";
  // TruncateSequencePair with an odd max_length (5) splits a {4}/{4} pair into 3+2.

  // Pin seed and worker count so RandomData reproduces the expected values.
  auto original_seed = config::get_seed();
  bool status_set_seed = config::set_seed(1);
  EXPECT_EQ(status_set_seed, true);
  auto original_worker = config::get_num_parallel_workers();
  bool status_set_worker = config::set_num_parallel_workers(1);
  EXPECT_EQ(status_set_worker, true);

  // RandomDataset with an int32 {4} column and an int64 {4} column, 4 rows.
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt32, {4}));
  ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt64, {4}));
  std::shared_ptr<Dataset> dataset = RandomData(4, schema);
  EXPECT_NE(dataset, nullptr);

  // Truncate the pair of columns to a combined length of 5.
  std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(5);
  EXPECT_NE(truncate_sequence_pair, nullptr);
  dataset = dataset->Map({truncate_sequence_pair}, {"col1", "col2"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  const std::vector<std::vector<int32_t>> expected1 = {{1785358954, 1785358954, 1785358954},
                                                       {-1195853640, -1195853640, -1195853640},
                                                       {0, 0, 0},
                                                       {1296911693, 1296911693, 1296911693}};
  const std::vector<std::vector<int64_t>> expected2 = {
    {-1, -1}, {-1229782938247303442, -1229782938247303442}, {2314885530818453536, 2314885530818453536}, {-1, -1}};

  uint64_t count = 0;
  for (; !row.empty(); ++count) {
    // Both truncated columns must match the seeded reference data.
    std::shared_ptr<Tensor> de_tensor1;
    ASSERT_OK(Tensor::CreateFromVector(expected1[count], &de_tensor1));
    mindspore::MSTensor expected_tensor1 =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor1));
    EXPECT_MSTENSOR_EQ(row["col1"], expected_tensor1);

    std::shared_ptr<Tensor> de_tensor2;
    ASSERT_OK(Tensor::CreateFromVector(expected2[count], &de_tensor2));
    mindspore::MSTensor expected_tensor2 =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor2));
    EXPECT_MSTENSOR_EQ(row["col2"], expected_tensor2);

    ASSERT_OK(iterator->GetNextRow(&row));
  }

  EXPECT_EQ(count, 4);

  // Tear the pipeline down explicitly.
  iterator->Stop();

  // Restore the global config touched above.
  status_set_seed = config::set_seed(original_seed);
  EXPECT_EQ(status_set_seed, true);
  status_set_worker = config::set_num_parallel_workers(original_worker);
  EXPECT_EQ(status_set_worker, true);
}
1979
TEST_F(MindDataTestPipeline, TestTruncateSequencePairFail) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairFail.";
  // A negative max_length must be rejected when the tree is validated.

  // RandomDataset with two int8 {3} columns, 3 rows.
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt8, {3}));
  ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt8, {3}));
  std::shared_ptr<Dataset> dataset = RandomData(3, schema);
  EXPECT_NE(dataset, nullptr);

  // TruncateSequencePair configured with an invalid negative max_length.
  std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(-1);
  EXPECT_NE(truncate_sequence_pair, nullptr);
  dataset = dataset->Map({truncate_sequence_pair});
  EXPECT_NE(dataset, nullptr);

  // Validation during tree construction rejects the transform.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
2003
TEST_F(MindDataTestPipeline, TestNgramSuccess) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess.";
  // Bigrams over whitespace tokens, padded with "_" on both sides, joined by " ".

  // Non-shuffled TextFile reader over the three-sentence fixture.
  const std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Tokenize on whitespace, then build 2-grams with single "_" pads.
  std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  EXPECT_NE(white_tokenizer, nullptr);
  std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2}, {"_", 1}, {"_", 1}, " "));
  EXPECT_NE(ngram_op, nullptr);

  dataset = dataset->Map({white_tokenizer, ngram_op}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  const std::vector<std::vector<std::string>> expected = {
    {"_ This", "This is", "is a", "a text", "text file.", "file. _"},
    {"_ Be", "Be happy", "happy every", "every day.", "day. _"},
    {"_ Good", "Good luck", "luck to", "to everyone.", "everyone. _"}};

  uint64_t count = 0;
  for (; !row.empty(); ++count) {
    // Each row yields a 1-D tensor holding that sentence's n-grams.
    std::shared_ptr<Tensor> de_tensor;
    int num_elems = expected[count].size();
    ASSERT_OK(Tensor::CreateFromVector(expected[count], TensorShape({num_elems}), &de_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(row["text"], expected_tensor);
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  EXPECT_EQ(count, 3);

  // Tear the pipeline down explicitly.
  iterator->Stop();
}
2057
TEST_F(MindDataTestPipeline, TestNgramSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess1.";
  // Mixed 2-grams and 3-grams with double "&" pads, joined by "-".

  // Non-shuffled TextFile reader over the three-sentence fixture.
  const std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Tokenize on whitespace, then build 2- and 3-grams with width-2 "&" pads.
  std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  EXPECT_NE(white_tokenizer, nullptr);
  std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2, 3}, {"&", 2}, {"&", 2}, "-"));
  EXPECT_NE(ngram_op, nullptr);

  dataset = dataset->Map({white_tokenizer, ngram_op}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  const std::vector<std::vector<std::string>> expected = {
    {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a",
     "is-a-text",
     "a-text-file.", "text-file.-&", "file.-&-&"},
    {"&-Be", "Be-happy", "happy-every", "every-day.", "day.-&", "&-&-Be", "&-Be-happy", "Be-happy-every",
     "happy-every-day.", "every-day.-&", "day.-&-&"},
    {"&-Good", "Good-luck", "luck-to", "to-everyone.", "everyone.-&", "&-&-Good", "&-Good-luck", "Good-luck-to",
     "luck-to-everyone.", "to-everyone.-&", "everyone.-&-&"}};

  uint64_t count = 0;
  for (; !row.empty(); ++count) {
    // Each row yields both orders of n-grams concatenated in one 1-D tensor.
    std::shared_ptr<Tensor> de_tensor;
    int num_elems = expected[count].size();
    ASSERT_OK(Tensor::CreateFromVector(expected[count], TensorShape({num_elems}), &de_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(row["text"], expected_tensor);
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  EXPECT_EQ(count, 3);

  // Tear the pipeline down explicitly.
  iterator->Stop();
}
2115
TEST_F(MindDataTestPipeline, TestNgramFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail1.";
  // An empty ngrams vector is invalid and must be rejected at validation time.

  // Non-shuffled TextFile reader.
  const std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Ngram configured with no n-gram sizes at all.
  std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({}));
  EXPECT_NE(ngram_op, nullptr);
  dataset = dataset->Map({ngram_op});
  EXPECT_NE(dataset, nullptr);

  // Validation during tree construction rejects the transform.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
2138
TEST_F(MindDataTestPipeline, TestNgramFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail2.";
  // An n-gram size of 0 is invalid and must be rejected at validation time.

  // Non-shuffled TextFile reader.
  const std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Ngram configured with a zero n-gram size.
  std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({0}));
  EXPECT_NE(ngram_op, nullptr);
  dataset = dataset->Map({ngram_op});
  EXPECT_NE(dataset, nullptr);

  // Validation during tree construction rejects the transform.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
2161
TEST_F(MindDataTestPipeline, TestNgramFail3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail3.";
  // A negative n-gram size is invalid and must be rejected at validation time.

  // Non-shuffled TextFile reader.
  const std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Ngram configured with a negative n-gram size.
  std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({-2}));
  EXPECT_NE(ngram_op, nullptr);
  dataset = dataset->Map({ngram_op});
  EXPECT_NE(dataset, nullptr);

  // Validation during tree construction rejects the transform.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
2184
TEST_F(MindDataTestPipeline, TestNgramFail4) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail4.";
  // A negative left_pad width is invalid and must be rejected at validation time.

  // Non-shuffled TextFile reader.
  const std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Ngram configured with pad_width -1 in left_pad.
  std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2}, {"", -1}));
  EXPECT_NE(ngram_op, nullptr);
  dataset = dataset->Map({ngram_op});
  EXPECT_NE(dataset, nullptr);

  // Validation during tree construction rejects the transform.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
2207
TEST_F(MindDataTestPipeline, TestNgramFail5) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail5.";
  // A negative right_pad width is invalid and must be rejected at validation time.

  // Non-shuffled TextFile reader.
  const std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Ngram configured with pad_width -1 in right_pad (left_pad is valid).
  std::shared_ptr<TensorTransform> ngram_op(new text::Ngram({2}, {"", 1}, {"", -1}));
  EXPECT_NE(ngram_op, nullptr);
  dataset = dataset->Map({ngram_op});
  EXPECT_NE(dataset, nullptr);

  // Validation during tree construction rejects the transform.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
2230
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success.";
  // NormalizeUTF8 with NormalizeForm::kNfkc should compose + compatibility-fold.

  // Non-shuffled TextFile reader over the normalization fixture.
  const std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Normalize the "text" column with the NFKC form.
  std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfkc);
  EXPECT_NE(normalizeutf8, nullptr);
  dataset = dataset->Map({normalizeutf8}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  const std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};

  uint64_t count = 0;
  for (; !row.empty(); ++count) {
    // Each normalized string must match the precomputed NFKC form.
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[count], &de_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(row["text"], expected_tensor);
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  EXPECT_EQ(count, 6);

  // Tear the pipeline down explicitly.
  iterator->Stop();
}
2276
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success1.";
  // NormalizeUTF8 with NormalizeForm::kNfc should canonically compose only.

  // Non-shuffled TextFile reader over the normalization fixture.
  const std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Normalize the "text" column with the NFC form.
  std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfc);
  EXPECT_NE(normalizeutf8, nullptr);
  dataset = dataset->Map({normalizeutf8}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  const std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};

  uint64_t count = 0;
  for (; !row.empty(); ++count) {
    // Each normalized string must match the precomputed NFC form.
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[count], &de_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(row["text"], expected_tensor);
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  EXPECT_EQ(count, 6);

  // Tear the pipeline down explicitly.
  iterator->Stop();
}
2322
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success2.";
  // NormalizeUTF8 with NormalizeForm::kNfd should canonically decompose only.

  // Non-shuffled TextFile reader over the normalization fixture.
  const std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> dataset = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Normalize the "text" column with the NFD form.
  std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfd);
  EXPECT_NE(normalizeutf8, nullptr);
  dataset = dataset->Map({normalizeutf8}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator builds and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  const std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};

  uint64_t count = 0;
  for (; !row.empty(); ++count) {
    // Each normalized string must match the precomputed NFD form.
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[count], &de_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(row["text"], expected_tensor);
    ASSERT_OK(iterator->GetNextRow(&row));
  }

  EXPECT_EQ(count, 6);

  // Tear the pipeline down explicitly.
  iterator->Stop();
}
2368
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success3) {
  // Verify NormalizeUTF8 when the normalize_form parameter is NormalizeForm::kNfkd.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success3.";

  // Source dataset: one text line per row, read sequentially (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Apply NFKD normalization to the "text" column.
  std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfkd);
  EXPECT_NE(normalizeutf8, nullptr);

  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // NFKD additionally decomposes compatibility characters (e.g. superscripts).
  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[num_rows], &de_tensor));
    mindspore::MSTensor ms_tensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(text_col, ms_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // All six rows must have been consumed.
  EXPECT_EQ(num_rows, 6);

  // Tear the pipeline down explicitly.
  iter->Stop();
}
2414
TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess) {
  // Verify RegexReplace when replace_all is true: every match is rewritten.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess.";

  // Source dataset: one text line per row, read sequentially (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Replace every whitespace run with an underscore.
  std::shared_ptr<TensorTransform> regex_replace = std::make_shared<text::RegexReplace>("\\s+", "_", true);
  EXPECT_NE(regex_replace, nullptr);

  ds = ds->Map({regex_replace}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // With replace_all=true, all whitespace occurrences become "_".
  std::vector<std::string> expected = {"Hello_World", "Let's_Go", "1:hello", "2:world",
                                       "31:beijing", "Welcome_to_China!", "_我_不想_长大_", "Welcome_to_Shenzhen!"};

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[num_rows], &de_tensor));
    mindspore::MSTensor ms_tensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(text_col, ms_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // All eight rows must have been consumed.
  EXPECT_EQ(num_rows, 8);

  // Tear the pipeline down explicitly.
  iter->Stop();
}
2461
TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess1) {
  // Verify RegexReplace when replace_all is false: only the first match is rewritten.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess1.";

  // Source dataset: one text line per row, read sequentially (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Replace only the first whitespace run with an underscore.
  std::shared_ptr<TensorTransform> regex_replace = std::make_shared<text::RegexReplace>("\\s+", "_", false);
  EXPECT_NE(regex_replace, nullptr);

  ds = ds->Map({regex_replace}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // With replace_all=false, later whitespace in each line is left untouched.
  std::vector<std::string> expected = {"Hello_World", "Let's_Go", "1:hello", "2:world",
                                       "31:beijing", "Welcome_to China!", "_我 不想 长大 ", "Welcome_to Shenzhen!"};

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[num_rows], &de_tensor));
    mindspore::MSTensor ms_tensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(text_col, ms_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // All eight rows must have been consumed.
  EXPECT_EQ(num_rows, 8);

  // Tear the pipeline down explicitly.
  iter->Stop();
}
2508
TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess) {
  // Verify RegexTokenizer when with_offsets is false (only the token column is produced).
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess.";

  // Source dataset: one text line per row, read sequentially (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Split on whitespace; the keep-delim pattern also emits the whitespace runs as tokens.
  std::shared_ptr<TensorTransform> regex_tokenizer = std::make_shared<text::RegexTokenizer>("\\s+", "\\s+", false);
  EXPECT_NE(regex_tokenizer, nullptr);

  ds = ds->Map({regex_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Per-row token lists; delimiter tokens (spaces) are interleaved with words.
  std::vector<std::vector<std::string>> expected = {{"Hello", " ", "World"},
                                                    {"Let's", " ", "Go"},
                                                    {"1:hello"},
                                                    {"2:world"},
                                                    {"31:beijing"},
                                                    {"Welcome", " ", "to", " ", "China!"},
                                                    {" ", "我", " ", "不想", " ", "长大", " "},
                                                    {"Welcome", " ", "to", " ", "Shenzhen!"}};

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto text_col = row["text"];

    const int num_tokens = static_cast<int>(expected[num_rows].size());
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[num_rows], TensorShape({num_tokens}), &de_tensor));
    mindspore::MSTensor ms_tensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(text_col, ms_tensor);

    ASSERT_OK(iter->GetNextRow(&row));
  }

  // All eight rows must have been consumed.
  EXPECT_EQ(num_rows, 8);

  // Tear the pipeline down explicitly.
  iter->Stop();
}
2564
TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess1) {
  // Verify RegexTokenizer when with_offsets is true (token + byte-offset columns are produced).
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess1.";

  // Source dataset: one text line per row, read sequentially (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Split on whitespace and also request per-token byte offsets.
  std::shared_ptr<TensorTransform> regex_tokenizer = std::make_shared<text::RegexTokenizer>("\\s+", "\\s+", true);
  EXPECT_NE(regex_tokenizer, nullptr);

  // Map "text" into three output columns and project exactly those columns.
  ds = ds->Map({regex_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
               {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected_tokens = {{"Hello", " ", "World"},
                                                           {"Let's", " ", "Go"},
                                                           {"1:hello"},
                                                           {"2:world"},
                                                           {"31:beijing"},
                                                           {"Welcome", " ", "to", " ", "China!"},
                                                           {" ", "我", " ", "不想", " ", "长大", " "},
                                                           {"Welcome", " ", "to", " ", "Shenzhen!"}};

  // Byte offsets of each token within the original line (start inclusive, limit exclusive).
  std::vector<std::vector<uint32_t>> expected_offsets_start = {
    {0, 5, 6}, {0, 5, 6}, {0}, {0}, {0}, {0, 7, 8, 10, 11}, {0, 2, 5, 6, 12, 14, 20}, {0, 7, 8, 10, 11}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {
    {5, 6, 11}, {5, 6, 8}, {7}, {7}, {10}, {7, 8, 10, 11, 17}, {2, 5, 6, 12, 14, 20, 21}, {7, 8, 10, 11, 20}};

  // Small helper: wrap a DE tensor as an MSTensor for comparison.
  auto as_ms_tensor = [](const std::shared_ptr<Tensor> &de) {
    return mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de));
  };

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto token = row["token"];
    auto start = row["offsets_start"];
    auto limit = row["offsets_limit"];

    // The three columns share one length: the token count for this row.
    const int num_tokens = static_cast<int>(expected_tokens[num_rows].size());

    std::shared_ptr<Tensor> de_tokens;
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[num_rows], TensorShape({num_tokens}), &de_tokens));
    EXPECT_MSTENSOR_EQ(token, as_ms_tensor(de_tokens));

    std::shared_ptr<Tensor> de_starts;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[num_rows], TensorShape({num_tokens}), &de_starts));
    EXPECT_MSTENSOR_EQ(start, as_ms_tensor(de_starts));

    std::shared_ptr<Tensor> de_limits;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[num_rows], TensorShape({num_tokens}), &de_limits));
    EXPECT_MSTENSOR_EQ(limit, as_ms_tensor(de_limits));

    ASSERT_OK(iter->GetNextRow(&row));
  }

  // All eight rows must have been consumed.
  EXPECT_EQ(num_rows, 8);

  // Tear the pipeline down explicitly.
  iter->Stop();
}
2640
TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess) {
  // Verify UnicodeCharTokenizer with the default with_offsets (false).
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess.";

  // Source dataset: one text line per row, read sequentially (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Tokenize each line into individual Unicode characters.
  std::shared_ptr<TensorTransform> unicodechar_tokenizer = std::make_shared<text::UnicodeCharTokenizer>();
  EXPECT_NE(unicodechar_tokenizer, nullptr);

  ds = ds->Map({unicodechar_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // One entry per Unicode code point, including spaces.
  std::vector<std::vector<std::string>> expected = {
    {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
    {"北", "京", "欢", "迎", "您", "!"},
    {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
    {" ", " "}};

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto text_col = row["text"];

    const int num_tokens = static_cast<int>(expected[num_rows].size());
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[num_rows], TensorShape({num_tokens}), &de_tensor));
    mindspore::MSTensor ms_tensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(text_col, ms_tensor);

    ASSERT_OK(iter->GetNextRow(&row));
  }

  // All four rows must have been consumed.
  EXPECT_EQ(num_rows, 4);

  // Tear the pipeline down explicitly.
  iter->Stop();
}
2693
TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess1) {
  // Verify UnicodeCharTokenizer with with_offsets set to true.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess1.";

  // Source dataset: one text line per row, read sequentially (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Tokenize into Unicode characters and also request byte offsets.
  std::shared_ptr<TensorTransform> unicodechar_tokenizer = std::make_shared<text::UnicodeCharTokenizer>(true);
  EXPECT_NE(unicodechar_tokenizer, nullptr);

  // Map "text" into three output columns and project exactly those columns.
  ds = ds->Map({unicodechar_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
               {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected_tokens = {
    {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
    {"北", "京", "欢", "迎", "您", "!"},
    {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
    {" ", " "}};

  // Byte offsets per character: ASCII advances by 1, CJK (UTF-8) by 3.
  std::vector<std::vector<uint32_t>> expected_offsets_start = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18},
    {0, 3, 6, 9, 12, 15},
    {0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16},
    {0, 1}};

  std::vector<std::vector<uint32_t>> expected_offsets_limit = {
    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
    {3, 6, 9, 12, 15, 18},
    {3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17},
    {1, 2}};

  // Small helper: wrap a DE tensor as an MSTensor for comparison.
  auto as_ms_tensor = [](const std::shared_ptr<Tensor> &de) {
    return mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de));
  };

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto token = row["token"];
    auto start = row["offsets_start"];
    auto limit = row["offsets_limit"];

    // The three columns share one length: the token count for this row.
    const int num_tokens = static_cast<int>(expected_tokens[num_rows].size());

    std::shared_ptr<Tensor> de_tokens;
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[num_rows], TensorShape({num_tokens}), &de_tokens));
    EXPECT_MSTENSOR_EQ(token, as_ms_tensor(de_tokens));

    std::shared_ptr<Tensor> de_starts;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[num_rows], TensorShape({num_tokens}), &de_starts));
    EXPECT_MSTENSOR_EQ(start, as_ms_tensor(de_starts));

    std::shared_ptr<Tensor> de_limits;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[num_rows], TensorShape({num_tokens}), &de_limits));
    EXPECT_MSTENSOR_EQ(limit, as_ms_tensor(de_limits));

    ASSERT_OK(iter->GetNextRow(&row));
  }

  // All four rows must have been consumed.
  EXPECT_EQ(num_rows, 4);

  // Tear the pipeline down explicitly.
  iter->Stop();
}
2773
// English vocabulary shared by the WordpieceTokenizer tests below.
// Entries prefixed with "##" are sub-word suffix pieces (wordpiece convention).
std::vector<std::string> vocab_english = {"book", "cholera", "era", "favor", "##ite", "my",
                                          "is", "love", "dur", "##ing", "the"};

// Chinese vocabulary (one character per entry) for the Chinese WordpieceTokenizer test.
std::vector<std::string> vocab_chinese = {"我", "最", "喜", "欢", "的", "书", "是", "霍", "乱", "时", "期", "爱", "情"};
2778
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess1.";
  // Verify WordpieceTokenizer with default parameters on the English vocab.

  // Source dataset: one text line per row, read sequentially (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Only the first 10 rows are relevant to this test.
  ds = ds->Take(10);
  EXPECT_NE(ds, nullptr);

  // Build a vocab object from the shared English word list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Default suffix indicator ("##"), max bytes, and unknown token ("[UNK]").
  std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  ds = ds->Map({wordpiece_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Out-of-vocab words fall back to the default unknown token "[UNK]".
  std::vector<std::vector<std::string>> expected = {
    {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[num_rows], &de_tensor));
    mindspore::MSTensor ms_tensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(text_col, ms_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // All ten rows must have been consumed.
  EXPECT_EQ(num_rows, 10);

  // Tear the pipeline down explicitly.
  iter->Stop();
}
2834
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess2.";
  // Verify WordpieceTokenizer when unknown_token is the empty string.

  // Source dataset: one text line per row, read sequentially (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Only the first 10 rows are relevant to this test.
  ds = ds->Take(10);
  EXPECT_NE(ds, nullptr);

  // Build a vocab object from the shared English word list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Empty unknown_token: out-of-vocab words pass through unchanged.
  std::shared_ptr<TensorTransform> wordpiece_tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "", false);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  ds = ds->Map({wordpiece_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Note the last row: "what" is out-of-vocab yet kept as-is.
  std::vector<std::vector<std::string>> expected = {
    {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"what"}};

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[num_rows], &de_tensor));
    mindspore::MSTensor ms_tensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(text_col, ms_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // All ten rows must have been consumed.
  EXPECT_EQ(num_rows, 10);

  // Tear the pipeline down explicitly.
  iter->Stop();
}
2891
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess3.";
  // Verify WordpieceTokenizer with a non-default max_bytes_per_token.

  // Source dataset: one text line per row, read sequentially (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Only the first 10 rows are relevant to this test.
  ds = ds->Take(10);
  EXPECT_NE(ds, nullptr);

  // Build a vocab object from the shared English word list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // max_bytes_per_token = 4: any word longer than 4 bytes maps to "[UNK]".
  std::shared_ptr<TensorTransform> wordpiece_tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 4, "[UNK]", false);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  ds = ds->Map({wordpiece_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Words exceeding 4 bytes ("favorite", "during", "cholera", ...) become "[UNK]".
  std::vector<std::vector<std::string>> expected = {{"my"}, {"[UNK]"}, {"book"}, {"is"}, {"love"},
                                                    {"[UNK]"}, {"the"}, {"[UNK]"}, {"era"}, {"[UNK]"}};

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[num_rows], &de_tensor));
    mindspore::MSTensor ms_tensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(text_col, ms_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // All ten rows must have been consumed.
  EXPECT_EQ(num_rows, 10);

  // Tear the pipeline down explicitly.
  iter->Stop();
}
2948
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess4) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess4.";
  // Verify WordpieceTokenizer with default-style parameters on the Chinese vocab.

  // Source dataset: one text line per row, read sequentially (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // The Chinese portion of the file starts after the first 10 (English) rows.
  ds = ds->Skip(10);
  EXPECT_NE(ds, nullptr);

  // Consume the next 15 rows.
  ds = ds->Take(15);
  EXPECT_NE(ds, nullptr);

  // Build a vocab object from the shared Chinese character list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_chinese, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  std::shared_ptr<TensorTransform> wordpiece_tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", false);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  ds = ds->Map({wordpiece_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // One character per row; the final out-of-vocab row becomes "[UNK]".
  std::vector<std::vector<std::string>> expected = {{"我"}, {"最"}, {"喜"}, {"欢"}, {"的"}, {"书"}, {"是"}, {"霍"},
                                                    {"乱"}, {"时"}, {"期"}, {"的"}, {"爱"}, {"情"}, {"[UNK]"}};

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[num_rows], &de_tensor));
    mindspore::MSTensor ms_tensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(text_col, ms_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // All fifteen rows must have been consumed.
  EXPECT_EQ(num_rows, 15);

  // Tear the pipeline down explicitly.
  iter->Stop();
}
3009
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess5) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess5.";
  // Verify WordpieceTokenizer when with_offsets is true.

  // Source dataset: one text line per row, read sequentially (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Only the first 10 rows are relevant to this test.
  ds = ds->Take(10);
  EXPECT_NE(ds, nullptr);

  // Build a vocab object from the shared English word list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // with_offsets=true yields token plus start/limit byte-offset columns.
  std::shared_ptr<TensorTransform> wordpiece_tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", true);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {
    {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};
  // Byte offsets of each wordpiece within its source word.
  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0}, {0, 5}, {0}, {0}, {0}, {0, 3}, {0}, {0}, {0}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{2}, {5, 8}, {4}, {2}, {4}, {3, 6}, {3}, {7}, {3}, {4}};

  // Small helper: wrap a DE tensor as an MSTensor for comparison.
  auto as_ms_tensor = [](const std::shared_ptr<Tensor> &de) {
    return mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de));
  };

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto token = row["token"];
    std::shared_ptr<Tensor> de_tokens;
    ASSERT_OK(Tensor::CreateFromVector(expected[num_rows], &de_tokens));
    EXPECT_MSTENSOR_EQ(token, as_ms_tensor(de_tokens));

    auto start = row["offsets_start"];
    std::shared_ptr<Tensor> de_starts;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[num_rows], &de_starts));
    EXPECT_MSTENSOR_EQ(start, as_ms_tensor(de_starts));

    auto limit = row["offsets_limit"];
    std::shared_ptr<Tensor> de_limits;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[num_rows], &de_limits));
    EXPECT_MSTENSOR_EQ(limit, as_ms_tensor(de_limits));

    ASSERT_OK(iter->GetNextRow(&row));
  }

  // All ten rows must have been consumed.
  EXPECT_EQ(num_rows, 10);

  // Tear the pipeline down explicitly.
  iter->Stop();
}
3082
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess6) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess6.";
  // Verify WordpieceTokenizer when max_bytes_per_token is 0.

  // Source dataset: one text line per row, read sequentially (no shuffle).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Only the first 10 rows are relevant to this test.
  ds = ds->Take(10);
  EXPECT_NE(ds, nullptr);

  // Build a vocab object from the shared English word list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // max_bytes_per_token = 0: every word exceeds the limit.
  std::shared_ptr<TensorTransform> wordpiece_tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 0, "[UNK]", true);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // With a zero byte budget, every token collapses to "[UNK]".
  std::vector<std::vector<std::string>> expected = {{"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"},
                                                    {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}};

  uint64_t num_rows = 0;
  for (; !row.empty(); ++num_rows) {
    auto token_col = row["token"];
    std::shared_ptr<Tensor> de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[num_rows], &de_tensor));
    mindspore::MSTensor ms_tensor(std::make_shared<mindspore::dataset::DETensor>(de_tensor));
    EXPECT_MSTENSOR_EQ(token_col, ms_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
  }

  // All ten rows must have been consumed.
  EXPECT_EQ(num_rows, 10);

  // Tear the pipeline down explicitly.
  iter->Stop();
}
3139
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail1.";
  // Test WordpieceTokenizer with nullptr vocab

  // Build a TextFile dataset
  std::string file_path = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Constructing the tokenizer with a null vocabulary still yields a non-null object;
  // parameter validation is deferred to tree-build time
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::WordpieceTokenizer>(nullptr);
  EXPECT_NE(tokenizer, nullptr);

  // Attach the tokenizer via Map
  dataset = dataset->Map({tokenizer});
  EXPECT_NE(dataset, nullptr);

  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  // Expect failure: invalid WordpieceTokenizer input with nullptr vocab
  EXPECT_EQ(iterator, nullptr);
}
3161
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail2.";
  // Test WordpieceTokenizer with negative max_bytes_per_token

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create WordpieceTokenizer operation on ds with an invalid max_bytes_per_token of -1;
  // construction succeeds because parameter validation is deferred to tree-build time
  std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab, "##", -1);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({wordpiece_tokenizer});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid WordpieceTokenizer input with negative max_bytes_per_token
  EXPECT_EQ(iter, nullptr);
}
3188
TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess) {
  // Testing the parameter of UnicodeScriptTokenizer interface when the with_offsets and the keep_whitespace is default.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess.";

  // Build a TextFile dataset over the tokenizer test data
  std::string file_path = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Default-constructed tokenizer: whitespace dropped, no offsets emitted
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::UnicodeScriptTokenizer>();
  EXPECT_NE(tokenizer, nullptr);

  // Tokenize the "text" column in place
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Pull rows and compare against the expected token lists
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {
    {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};

  uint64_t row_count = 0;
  for (; !row.empty(); ++row_count) {
    auto text_col = row["text"];

    int num_tokens = expected[row_count].size();
    std::shared_ptr<Tensor> expected_de;
    ASSERT_OK(Tensor::CreateFromVector(expected[row_count], TensorShape({num_tokens}), &expected_de));
    mindspore::MSTensor expected_ms =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de));
    EXPECT_MSTENSOR_EQ(text_col, expected_ms);

    ASSERT_OK(iterator->GetNextRow(&row));
  }

  EXPECT_EQ(row_count, 4);

  // Shut down the pipeline
  iterator->Stop();
}
3238
TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess1) {
  // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is true and the with_offsets is
  // false.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess1.";

  // Build a TextFile dataset over the tokenizer test data
  std::string file_path = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // keep_whitespace = true, so whitespace runs show up as tokens
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::UnicodeScriptTokenizer>(true);
  EXPECT_NE(tokenizer, nullptr);

  // Tokenize the "text" column in place
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Pull rows and compare against the expected token lists (whitespace preserved)
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {
    {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {" "}};

  uint64_t row_count = 0;
  for (; !row.empty(); ++row_count) {
    auto text_col = row["text"];

    int num_tokens = expected[row_count].size();
    std::shared_ptr<Tensor> expected_de;
    ASSERT_OK(Tensor::CreateFromVector(expected[row_count], TensorShape({num_tokens}), &expected_de));
    mindspore::MSTensor expected_ms =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de));
    EXPECT_MSTENSOR_EQ(text_col, expected_ms);

    ASSERT_OK(iterator->GetNextRow(&row));
  }

  EXPECT_EQ(row_count, 4);

  // Shut down the pipeline
  iterator->Stop();
}
3289
TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess2) {
  // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is false and the with_offsets is
  // true.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess2.";

  // Build a TextFile dataset over the tokenizer test data
  std::string file_path = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // keep_whitespace = false, with_offsets = true
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::UnicodeScriptTokenizer>(false, true);
  EXPECT_NE(tokenizer, nullptr);

  // Map "text" into token plus byte-offset columns
  dataset = dataset->Map({tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
                         {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Pull rows and compare tokens and offsets against the expectations
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected_tokens = {
    {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};

  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 18, 19}, {15, 18}, {9, 16, 17}, {0}};

  uint64_t row_count = 0;
  for (; !row.empty(); ++row_count) {
    auto token_col = row["token"];
    auto start_col = row["offsets_start"];
    auto limit_col = row["offsets_limit"];

    // All three expected tensors for a row share the same length
    int num_tokens = expected_tokens[row_count].size();

    std::shared_ptr<Tensor> de_tokens;
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[row_count], TensorShape({num_tokens}), &de_tokens));
    mindspore::MSTensor ms_tokens =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tokens));
    EXPECT_MSTENSOR_EQ(token_col, ms_tokens);

    std::shared_ptr<Tensor> de_starts;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[row_count], TensorShape({num_tokens}), &de_starts));
    mindspore::MSTensor ms_starts =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_starts));
    EXPECT_MSTENSOR_EQ(start_col, ms_starts);

    std::shared_ptr<Tensor> de_limits;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[row_count], TensorShape({num_tokens}), &de_limits));
    mindspore::MSTensor ms_limits =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_limits));
    EXPECT_MSTENSOR_EQ(limit_col, ms_limits);

    ASSERT_OK(iterator->GetNextRow(&row));
  }

  EXPECT_EQ(row_count, 4);

  // Shut down the pipeline
  iterator->Stop();
}
3359
TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess3) {
  // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is true and the with_offsets is
  // true.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess3.";

  // Build a TextFile dataset over the tokenizer test data
  std::string file_path = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // keep_whitespace = true, with_offsets = true
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::UnicodeScriptTokenizer>(true, true);
  EXPECT_NE(tokenizer, nullptr);

  // Map "text" into token plus byte-offset columns
  dataset = dataset->Map({tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
                         {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Pull rows and compare tokens and offsets against the expectations
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected_tokens = {
    {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {" "}};

  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 7, 8, 10, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 8, 10, 11, 18, 19}, {15, 18}, {9, 16, 17}, {2}};

  uint64_t row_count = 0;
  for (; !row.empty(); ++row_count) {
    auto token_col = row["token"];
    auto start_col = row["offsets_start"];
    auto limit_col = row["offsets_limit"];

    // All three expected tensors for a row share the same length
    int num_tokens = expected_tokens[row_count].size();

    std::shared_ptr<Tensor> de_tokens;
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[row_count], TensorShape({num_tokens}), &de_tokens));
    mindspore::MSTensor ms_tokens =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tokens));
    EXPECT_MSTENSOR_EQ(token_col, ms_tokens);

    std::shared_ptr<Tensor> de_starts;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[row_count], TensorShape({num_tokens}), &de_starts));
    mindspore::MSTensor ms_starts =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_starts));
    EXPECT_MSTENSOR_EQ(start_col, ms_starts);

    std::shared_ptr<Tensor> de_limits;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[row_count], TensorShape({num_tokens}), &de_limits));
    mindspore::MSTensor ms_limits =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_limits));
    EXPECT_MSTENSOR_EQ(limit_col, ms_limits);

    ASSERT_OK(iterator->GetNextRow(&row));
  }

  EXPECT_EQ(row_count, 4);

  // Shut down the pipeline
  iterator->Stop();
}
3428
TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess) {
  // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is default.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess.";

  // Build a TextFile dataset over the text-file test data
  std::string file_path = datasets_root_path_ + "/testTextFileDataset/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Default-constructed tokenizer: with_offsets = false
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::WhitespaceTokenizer>();
  EXPECT_NE(tokenizer, nullptr);

  // Tokenize the "text" column in place
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Pull rows and compare against the expected token lists
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected = {
    {"This", "is", "a", "text", "file."}, {"Be", "happy", "every", "day."}, {"Good", "luck", "to", "everyone."}};

  uint64_t row_count = 0;
  for (; !row.empty(); ++row_count) {
    auto text_col = row["text"];

    int num_tokens = expected[row_count].size();
    std::shared_ptr<Tensor> expected_de;
    ASSERT_OK(Tensor::CreateFromVector(expected[row_count], TensorShape({num_tokens}), &expected_de));
    mindspore::MSTensor expected_ms =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de));
    EXPECT_MSTENSOR_EQ(text_col, expected_ms);

    ASSERT_OK(iterator->GetNextRow(&row));
  }

  EXPECT_EQ(row_count, 3);

  // Shut down the pipeline
  iterator->Stop();
}
3478
TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess1) {
  // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is true.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess1.";

  // Build a TextFile dataset over the tokenizer test data
  std::string file_path = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // with_offsets = true, so the tokenizer emits offset columns too
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::WhitespaceTokenizer>(true);
  EXPECT_NE(tokenizer, nullptr);

  // Map "text" into token plus byte-offset columns
  dataset = dataset->Map({tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"},
                         {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Pull rows and compare tokens and offsets against the expectations
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iterator->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected_tokens = {
    {"Welcome", "to", "Beijing!"}, {"北京欢迎您!"}, {"我喜欢English!"}, {""}};

  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11}, {0}, {0}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 19}, {18}, {17}, {0}};

  uint64_t row_count = 0;
  for (; !row.empty(); ++row_count) {
    auto token_col = row["token"];
    auto start_col = row["offsets_start"];
    auto limit_col = row["offsets_limit"];

    // All three expected tensors for a row share the same length
    int num_tokens = expected_tokens[row_count].size();

    std::shared_ptr<Tensor> de_tokens;
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[row_count], TensorShape({num_tokens}), &de_tokens));
    mindspore::MSTensor ms_tokens =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tokens));
    EXPECT_MSTENSOR_EQ(token_col, ms_tokens);

    std::shared_ptr<Tensor> de_starts;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[row_count], TensorShape({num_tokens}), &de_starts));
    mindspore::MSTensor ms_starts =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_starts));
    EXPECT_MSTENSOR_EQ(start_col, ms_starts);

    std::shared_ptr<Tensor> de_limits;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[row_count], TensorShape({num_tokens}), &de_limits));
    mindspore::MSTensor ms_limits =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_limits));
    EXPECT_MSTENSOR_EQ(limit_col, ms_limits);

    ASSERT_OK(iterator->GetNextRow(&row));
  }

  EXPECT_EQ(row_count, 4);

  // Shut down the pipeline
  iterator->Stop();
}
3546