1 /**
2 * Copyright 2020-2022 Huawei Technologies Co., Ltd
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16 #include <memory>
17 #include <string>
18 #include <vector>
19
20 #include "common/common.h"
21 #include "include/api/status.h"
22 #include "minddata/dataset/include/dataset/config.h"
23 #include "minddata/dataset/include/dataset/datasets.h"
24 #include "minddata/dataset/include/dataset/text.h"
25 #include "minddata/dataset/include/dataset/transforms.h"
26 #include "minddata/dataset/text/char_n_gram.h"
27 #include "minddata/dataset/text/fast_text.h"
28 #include "minddata/dataset/text/glove.h"
29 #include "minddata/dataset/text/vectors.h"
30
31 using namespace mindspore::dataset;
32 using mindspore::Status;
33 using mindspore::dataset::CharNGram;
34 using mindspore::dataset::FastText;
35 using mindspore::dataset::GloVe;
36 using mindspore::dataset::ShuffleMode;
37 using mindspore::dataset::Tensor;
38 using mindspore::dataset::Vectors;
39 using mindspore::dataset::Vocab;
40
// Test fixture for the MindData text-op pipeline tests below; inherits the
// common dataset-test utilities (e.g. datasets_root_path_) from
// UT::DatasetOpTesting.
class MindDataTestPipeline : public UT::DatasetOpTesting {
 protected:
};
44
45 /// Feature: BasicTokenizer op
46 /// Description: Test BasicTokenizer op on TextFileDataset with default inputs
47 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess1.";
  // Test BasicTokenizer with default parameters

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Take operation on ds: only the first 6 lines of the file are tokenized here
  ds = ds->Take(6);
  EXPECT_NE(ds, nullptr);

  // Create BasicTokenizer operation on ds
  std::shared_ptr<TensorTransform> basic_tokenizer = std::make_shared<text::BasicTokenizer>();
  EXPECT_NE(basic_tokenizer, nullptr);

  // Create Map operation on ds, tokenizing the "text" column in place
  ds = ds->Map({basic_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Golden token lists, one entry per input row.
  // NOTE(review): some expected tokens appear as empty strings "" here — they may
  // be characters lost in transit; confirm against the checked-in test data file.
  std::vector<std::vector<std::string>> expected = {
    {"Welcome", "to", "Beijing", "北", "京", "欢", "迎", "您"},
    {"長", "風", "破", "浪", "會", "有", "時", ",", "直", "掛", "雲", "帆", "濟", "滄", "海"},
    {"", "嘿", "嘿", "", "哈", "哈", "", "大", "笑", "", "嘻", "嘻"},
    {"明", "朝", "(", "1368", "—", "1644", "年", ")", "和", "清", "朝", "(", "1644", "—", "1911", "年", ")",
     ",", "是", "中", "国", "封", "建", "王", "朝", "史", "上", "最", "后", "两", "个", "朝", "代"},
    {"明", "代", "(", "1368", "-", "1644", ")", "と", "清", "代", "(", "1644",
     "-", "1911", ")", "は", "、", "中", "国", "の", "封", "建", "王", "朝",
     "の", "歴", "史", "における", "最", "後", "の2つの", "王", "朝", "でした"},
    {"명나라", "(", "1368", "-", "1644", ")", "와", "청나라", "(", "1644", "-",
     "1911", ")", "는", "중국", "봉건", "왕조의", "역사에서", "마지막", "두", "왕조였다"}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    // Build the golden tensor for this row and compare against the pipeline output.
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 6 taken rows must have been produced.
  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}
108
109 /// Feature: BasicTokenizer op
110 /// Description: Test BasicTokenizer op on TextFileDataset with lower_case=true
111 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess2.";
  // BasicTokenizer with lower_case enabled.

  // Source dataset: read the tokenizer test file in file order.
  std::string file_path = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Drop the first 6 rows; only the mixed-case sample remains.
  dataset = dataset->Skip(6);
  EXPECT_NE(dataset, nullptr);

  // Tokenizer under test: lower_case = true.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::BasicTokenizer>(true);
  EXPECT_NE(tokenizer, nullptr);

  // Apply the tokenizer to the "text" column.
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));

  // Golden output for the single remaining row.
  std::vector<std::string> golden_tokens = {"this", "is", "a", "funky", "string"};
  std::shared_ptr<Tensor> de_golden;
  ASSERT_OK(Tensor::CreateFromVector(golden_tokens, &de_golden));
  mindspore::MSTensor golden_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_golden));

  uint64_t row_count = 0;
  while (!next_row.empty()) {
    auto text_tensor = next_row["text"];
    EXPECT_MSTENSOR_EQ(text_tensor, golden_tensor);
    row_count++;
    ASSERT_OK(iterator->GetNextRow(&next_row));
  }

  // Exactly one row should remain after Skip(6).
  EXPECT_EQ(row_count, 1);

  // Tear down the pipeline.
  iterator->Stop();
}
161
162 /// Feature: BasicTokenizer op
163 /// Description: Test BasicTokenizer op on TextFileDataset with with_offsets=true and lower_case=true
164 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBasicTokenizerSuccess3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBasicTokenizerSuccess3.";
  // BasicTokenizer with both with_offsets and lower_case enabled.

  // Source dataset: read the tokenizer test file in file order.
  std::string file_path = datasets_root_path_ + "/testTokenizerData/basic_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Skip past the first 6 rows to reach the mixed-case sample.
  dataset = dataset->Skip(6);
  EXPECT_NE(dataset, nullptr);

  // lower_case = true, keep_whitespace = false, no normalization,
  // preserve_unused_token = true, with_offsets = true.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BasicTokenizer>(true, false, NormalizeForm::kNone, true, true);
  EXPECT_NE(tokenizer, nullptr);

  // With offsets enabled the op emits three output columns.
  dataset = dataset->Map({tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));

  // Golden tokens plus the byte offsets of each token within the input string.
  std::vector<std::string> golden_tokens = {"this", "is", "a", "funky", "string"};
  std::vector<uint32_t> golden_starts = {0, 5, 8, 10, 16};
  std::vector<uint32_t> golden_limits = {4, 7, 9, 15, 22};

  std::shared_ptr<Tensor> de_tokens;
  ASSERT_OK(Tensor::CreateFromVector(golden_tokens, &de_tokens));
  mindspore::MSTensor golden_token_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tokens));

  std::shared_ptr<Tensor> de_starts;
  ASSERT_OK(Tensor::CreateFromVector(golden_starts, &de_starts));
  mindspore::MSTensor golden_start_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_starts));

  std::shared_ptr<Tensor> de_limits;
  ASSERT_OK(Tensor::CreateFromVector(golden_limits, &de_limits));
  mindspore::MSTensor golden_limit_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_limits));

  uint64_t row_count = 0;
  while (!next_row.empty()) {
    // All three output columns must match their golden tensors.
    auto token_tensor = next_row["token"];
    EXPECT_MSTENSOR_EQ(token_tensor, golden_token_tensor);

    auto start_tensor = next_row["offsets_start"];
    EXPECT_MSTENSOR_EQ(start_tensor, golden_start_tensor);

    auto limit_tensor = next_row["offsets_limit"];
    EXPECT_MSTENSOR_EQ(limit_tensor, golden_limit_tensor);

    row_count++;
    ASSERT_OK(iterator->GetNextRow(&next_row));
  }

  // Exactly one row should remain after Skip(6).
  EXPECT_EQ(row_count, 1);

  // Tear down the pipeline.
  iterator->Stop();
}
235
// Shared vocabulary for all BertTokenizer test cases below: CJK characters,
// word pieces with "##" continuation markers, punctuation/digits, a bare
// space, and the BERT special/unused tokens.
// NOTE(review): a few entries render as empty strings "" — they may be
// characters lost in transit; confirm against the original test source.
std::vector<std::string> list = {
  "床", "前", "明", "月", "光", "疑", "是", "地", "上", "霜", "举", "头",
  "望", "低", "思", "故", "乡", "繁", "體", "字", "嘿", "哈", "大", "笑",
  "嘻", "i", "am", "mak", "make", "small", "mistake", "##s", "during", "work", "##ing", "hour",
  "", "", "", "", "+", "/", "-", "=", "12", "28", "40", "16",
  " ", "I", "[CLS]", "[SEP]", "[UNK]", "[PAD]", "[MASK]", "[unused1]", "[unused10]"};
242
243 /// Feature: BertTokenizer op
244 /// Description: Test BertTokenizer op on TextFileDataset with default parameters
245 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess1.";
  // BertTokenizer with all default parameters.

  // Source dataset: read the BERT tokenizer test file in file order.
  std::string file_path = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Keep only the first 4 rows (the CJK poem lines).
  dataset = dataset->Take(4);
  EXPECT_NE(dataset, nullptr);

  // Build the vocab from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status build_status = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(build_status, Status::OK());

  // Tokenizer under test, with default parameters.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::BertTokenizer>(vocab);
  EXPECT_NE(tokenizer, nullptr);

  // Apply the tokenizer to the "text" column.
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));

  // Golden token lists, one per input row.
  std::vector<std::vector<std::string>> golden = {{"床", "前", "明", "月", "光"},
                                                  {"疑", "是", "地", "上", "霜"},
                                                  {"举", "头", "望", "明", "月"},
                                                  {"低", "头", "思", "故", "乡"}};

  uint64_t row_count = 0;
  while (!next_row.empty()) {
    auto text_tensor = next_row["text"];
    // Build the golden tensor for this row and compare.
    std::shared_ptr<Tensor> de_golden;
    ASSERT_OK(Tensor::CreateFromVector(golden[row_count], &de_golden));
    mindspore::MSTensor golden_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_golden));
    EXPECT_MSTENSOR_EQ(text_tensor, golden_tensor);

    row_count++;
    ASSERT_OK(iterator->GetNextRow(&next_row));
  }

  // All 4 taken rows must have been produced.
  EXPECT_EQ(row_count, 4);

  // Tear down the pipeline.
  iterator->Stop();
}
304
305 /// Feature: BertTokenizer op
306 /// Description: Test BertTokenizer op on TextFileDataset with lower_case=true
307 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess2.";
  // BertTokenizer with lower_case enabled.

  // Source dataset: read the BERT tokenizer test file in file order.
  std::string file_path = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Select exactly the fifth row: skip 4, take 1.
  dataset = dataset->Skip(4);
  EXPECT_NE(dataset, nullptr);

  dataset = dataset->Take(1);
  EXPECT_NE(dataset, nullptr);

  // Build the vocab from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status build_status = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(build_status, Status::OK());

  // lower_case = true; other parameters at their documented defaults.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", true);
  EXPECT_NE(tokenizer, nullptr);

  // Apply the tokenizer to the "text" column.
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));

  // Golden word pieces for the single selected row.
  std::vector<std::string> golden_tokens = {"i", "am", "mak", "##ing", "small", "mistake",
                                            "##s", "during", "work", "##ing", "hour", "##s"};
  std::shared_ptr<Tensor> de_golden;
  ASSERT_OK(Tensor::CreateFromVector(golden_tokens, &de_golden));
  mindspore::MSTensor golden_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_golden));

  uint64_t row_count = 0;
  while (!next_row.empty()) {
    auto text_tensor = next_row["text"];
    EXPECT_MSTENSOR_EQ(text_tensor, golden_tensor);
    row_count++;
    ASSERT_OK(iterator->GetNextRow(&next_row));
  }

  // Exactly one row was selected.
  EXPECT_EQ(row_count, 1);

  // Tear down the pipeline.
  iterator->Stop();
}
368
369 /// Feature: BertTokenizer op
370 /// Description: Test BertTokenizer op on TextFileDataset with NormalizeForm::kNfc
371 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess3.";
  // Test BertTokenizer with normalization_form NFKC

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Skip operation on ds, then Take: selects rows 6 and 7 of the file
  ds = ds->Skip(5);
  EXPECT_NE(ds, nullptr);

  // Create Take operation on ds
  ds = ds->Take(2);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create BertTokenizer operation on ds with NFC normalization enabled
  std::shared_ptr<TensorTransform> bert_tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", false, false, NormalizeForm::kNfc);
  EXPECT_NE(bert_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({bert_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Golden token lists for the two selected rows.
  // NOTE(review): some expected tokens appear as empty strings "" here — they may
  // be characters lost in transit; confirm against the checked-in test data.
  std::vector<std::vector<std::string>> expected = {
    {"", "嘿", "嘿", "", "哈", "哈", "", "大", "笑", "", "嘻", "嘻"}, {"繁", "體", "字"}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    // Build the golden tensor for this row and compare against the pipeline output.
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // Both taken rows must have been produced.
  EXPECT_EQ(i, 2);

  // Manually terminate the pipeline
  iter->Stop();
}
433
434 /// Feature: BertTokenizer op
435 /// Description: Test BertTokenizer op on TextFileDataset with keep_whitespace=true
436 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess4) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess4.";
  // BertTokenizer with keep_whitespace enabled.

  // Source dataset: read the BERT tokenizer test file in file order.
  std::string file_path = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Select exactly the eighth row: skip 7, take 1.
  dataset = dataset->Skip(7);
  EXPECT_NE(dataset, nullptr);

  dataset = dataset->Take(1);
  EXPECT_NE(dataset, nullptr);

  // Build the vocab from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status build_status = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(build_status, Status::OK());

  // keep_whitespace = true so the literal space token survives.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", false, true);
  EXPECT_NE(tokenizer, nullptr);

  // Apply the tokenizer to the "text" column.
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));

  // Golden output: out-of-vocab word maps to [UNK], whitespace is preserved.
  std::vector<std::string> golden_tokens = {"[UNK]", " ", "[CLS]"};
  std::shared_ptr<Tensor> de_golden;
  ASSERT_OK(Tensor::CreateFromVector(golden_tokens, &de_golden));
  mindspore::MSTensor golden_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_golden));

  uint64_t row_count = 0;
  while (!next_row.empty()) {
    auto text_tensor = next_row["text"];
    EXPECT_MSTENSOR_EQ(text_tensor, golden_tensor);
    row_count++;
    ASSERT_OK(iterator->GetNextRow(&next_row));
  }

  // Exactly one row was selected.
  EXPECT_EQ(row_count, 1);

  // Tear down the pipeline.
  iterator->Stop();
}
496
497 /// Feature: BertTokenizer op
498 /// Description: Test BertTokenizer op on TextFileDataset with empty unknown_token and keep_whitespace=true
499 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess5) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess5.";
  // BertTokenizer with an empty unknown_token and keep_whitespace enabled.

  // Source dataset: read the BERT tokenizer test file in file order.
  std::string file_path = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Select exactly the eighth row: skip 7, take 1.
  dataset = dataset->Skip(7);
  EXPECT_NE(dataset, nullptr);

  dataset = dataset->Take(1);
  EXPECT_NE(dataset, nullptr);

  // Build the vocab from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status build_status = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(build_status, Status::OK());

  // unknown_token = "" keeps the raw text of out-of-vocab words;
  // keep_whitespace = true preserves the space token.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "", false, true);
  EXPECT_NE(tokenizer, nullptr);

  // Apply the tokenizer to the "text" column.
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));

  // Golden output: the unknown word passes through unchanged.
  std::vector<std::string> golden_tokens = {"unused", " ", "[CLS]"};
  std::shared_ptr<Tensor> de_golden;
  ASSERT_OK(Tensor::CreateFromVector(golden_tokens, &de_golden));
  mindspore::MSTensor golden_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_golden));

  uint64_t row_count = 0;
  while (!next_row.empty()) {
    auto text_tensor = next_row["text"];
    EXPECT_MSTENSOR_EQ(text_tensor, golden_tensor);
    row_count++;
    ASSERT_OK(iterator->GetNextRow(&next_row));
  }

  // Exactly one row was selected.
  EXPECT_EQ(row_count, 1);

  // Tear down the pipeline.
  iterator->Stop();
}
559
560 /// Feature: BertTokenizer op
561 /// Description: Test BertTokenizer op with preserve_unused_token=false, empty unknown_token, and keep_whitespace=true
562 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess6) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess6.";
  // BertTokenizer with preserve_unused_token disabled, empty unknown_token,
  // and keep_whitespace enabled.

  // Source dataset: read the BERT tokenizer test file in file order.
  std::string file_path = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Select exactly the eighth row: skip 7, take 1.
  dataset = dataset->Skip(7);
  EXPECT_NE(dataset, nullptr);

  dataset = dataset->Take(1);
  EXPECT_NE(dataset, nullptr);

  // Build the vocab from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status build_status = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(build_status, Status::OK());

  // preserve_unused_token = false: special tokens like [CLS] are split up.
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "", false, true, NormalizeForm::kNone, false);
  EXPECT_NE(tokenizer, nullptr);

  // Apply the tokenizer to the "text" column.
  dataset = dataset->Map({tokenizer}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));

  // Golden output: "[CLS]" no longer stays intact.
  std::vector<std::string> golden_tokens = {"unused", " ", "[", "CLS", "]"};
  std::shared_ptr<Tensor> de_golden;
  ASSERT_OK(Tensor::CreateFromVector(golden_tokens, &de_golden));
  mindspore::MSTensor golden_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_golden));

  uint64_t row_count = 0;
  while (!next_row.empty()) {
    auto text_tensor = next_row["text"];
    EXPECT_MSTENSOR_EQ(text_tensor, golden_tensor);
    row_count++;
    ASSERT_OK(iterator->GetNextRow(&next_row));
  }

  // Exactly one row was selected.
  EXPECT_EQ(row_count, 1);

  // Tear down the pipeline.
  iterator->Stop();
}
622
623 /// Feature: BertTokenizer op
624 /// Description: Test BertTokenizer op with with_offsets=true and lower_case=true
625 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestBertTokenizerSuccess7) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerSuccess7.";
  // BertTokenizer with with_offsets and lower_case enabled.

  // Source dataset: read the BERT tokenizer test file in file order.
  std::string file_path = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Select exactly the fifth row: skip 4, take 1.
  dataset = dataset->Skip(4);
  EXPECT_NE(dataset, nullptr);

  dataset = dataset->Take(1);
  EXPECT_NE(dataset, nullptr);

  // Build the vocab from the shared token list.
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status build_status = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(build_status, Status::OK());

  // lower_case = true and with_offsets = true (final argument).
  std::shared_ptr<TensorTransform> tokenizer =
    std::make_shared<text::BertTokenizer>(vocab, "##", 100, "[UNK]", true, false, NormalizeForm::kNone, true, true);
  EXPECT_NE(tokenizer, nullptr);

  // With offsets enabled the op emits three output columns.
  dataset = dataset->Map({tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));

  // Golden word pieces plus the byte offsets of each piece in the input.
  std::vector<std::string> golden_tokens = {"i", "am", "mak", "##ing", "small", "mistake",
                                            "##s", "during", "work", "##ing", "hour", "##s"};
  std::vector<uint32_t> golden_starts = {0, 2, 5, 8, 12, 18, 25, 27, 34, 38, 42, 46};
  std::vector<uint32_t> golden_limits = {1, 4, 8, 11, 17, 25, 26, 33, 38, 41, 46, 47};

  std::shared_ptr<Tensor> de_tokens;
  ASSERT_OK(Tensor::CreateFromVector(golden_tokens, &de_tokens));
  mindspore::MSTensor golden_token_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_tokens));

  std::shared_ptr<Tensor> de_starts;
  ASSERT_OK(Tensor::CreateFromVector(golden_starts, &de_starts));
  mindspore::MSTensor golden_start_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_starts));

  std::shared_ptr<Tensor> de_limits;
  ASSERT_OK(Tensor::CreateFromVector(golden_limits, &de_limits));
  mindspore::MSTensor golden_limit_tensor =
    mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_limits));

  uint64_t row_count = 0;
  while (!next_row.empty()) {
    // All three output columns must match their golden tensors.
    auto token_tensor = next_row["token"];
    EXPECT_MSTENSOR_EQ(token_tensor, golden_token_tensor);

    auto start_tensor = next_row["offsets_start"];
    EXPECT_MSTENSOR_EQ(start_tensor, golden_start_tensor);

    auto limit_tensor = next_row["offsets_limit"];
    EXPECT_MSTENSOR_EQ(limit_tensor, golden_limit_tensor);

    row_count++;
    ASSERT_OK(iterator->GetNextRow(&next_row));
  }

  // Exactly one row was selected.
  EXPECT_EQ(row_count, 1);

  // Tear down the pipeline.
  iterator->Stop();
}
706
707 /// Feature: BertTokenizer op
708 /// Description: Test BertTokenizer op with nullptr vocab
709 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestBertTokenizerFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail1.";
  // BertTokenizer constructed with a null vocab must fail at validation time.

  // Source dataset: read the BERT tokenizer test file in file order.
  std::string file_path = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Invalid tokenizer: vocab is nullptr.
  std::shared_ptr<TensorTransform> tokenizer = std::make_shared<text::BertTokenizer>(nullptr);
  EXPECT_NE(tokenizer, nullptr);

  // The Map node itself is created lazily, so this still succeeds.
  dataset = dataset->Map({tokenizer});
  EXPECT_NE(dataset, nullptr);

  // Validation happens when the iterator is built: expect a null iterator
  // because the BertTokenizer input vocab is nullptr.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_EQ(iterator, nullptr);
}
731
732 /// Feature: BertTokenizer op
733 /// Description: Test BertTokenizer op with negative max_bytes_per_token
734 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestBertTokenizerFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestBertTokenizerFail2.";
  // Test BertTokenizer with negative max_bytes_per_token

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(list, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create BertTokenizer operation on ds with an invalid (negative) max_bytes_per_token
  std::shared_ptr<TensorTransform> bert_tokenizer = std::make_shared<text::BertTokenizer>(vocab, "##", -1);
  EXPECT_NE(bert_tokenizer, nullptr);

  // Create a Map operation on ds; validation is deferred until iterator creation
  ds = ds->Map({bert_tokenizer});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid BertTokenizer input with negative max_bytes_per_token
  EXPECT_EQ(iter, nullptr);
}
761
762 /// Feature: CaseFold op
763 /// Description: Test CaseFold op on TextFileDataset with default parameters
764 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestCaseFoldSuccess) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCaseFoldSuccess.";

  // Source dataset: read the test file in file order.
  std::string file_path = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> dataset = TextFile({file_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // CaseFold op with default parameters.
  std::shared_ptr<TensorTransform> case_fold = std::make_shared<text::CaseFold>();
  EXPECT_NE(case_fold, nullptr);

  // Apply CaseFold to the "text" column.
  dataset = dataset->Map({case_fold}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));

  // Golden case-folded strings, one scalar per input row.
  std::vector<std::string> golden = {"welcome to beijing!", "北京欢迎您!", "我喜欢english!", "  "};

  uint64_t row_count = 0;
  while (!next_row.empty()) {
    auto text_tensor = next_row["text"];
    // Build the golden scalar tensor for this row and compare.
    std::shared_ptr<Tensor> de_golden;
    ASSERT_OK(Tensor::CreateScalar(golden[row_count], &de_golden));
    mindspore::MSTensor golden_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_golden));
    EXPECT_MSTENSOR_EQ(text_tensor, golden_tensor);
    row_count++;
    ASSERT_OK(iterator->GetNextRow(&next_row));
  }

  // All 4 input rows must have been produced.
  EXPECT_EQ(row_count, 4);

  // Tear down the pipeline.
  iterator->Stop();
}
809
810 /// Feature: FilterWikipediaXML op
811 /// Description: Test FilterWikipediaXML op in pipeline mode
812 /// Expectation: The data is processed successfully
TEST_F(MindDataTestPipeline,TestFilterWikipediaXMLSuccess)813 TEST_F(MindDataTestPipeline, TestFilterWikipediaXMLSuccess) {
814 // Testing the parameter of FilterWikipediaXML interface .
815 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFilterWikipediaXMLSuccess.";
816
817 // Create a TextFile dataset
818 std::string data_file = datasets_root_path_ + "/testTokenizerData/2.txt";
819 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
820 EXPECT_NE(ds, nullptr);
821
822 // Create filter_wikipedia_xml operation on ds
823 std::shared_ptr<TensorTransform> filter_wikipedia_xml = std::make_shared<text::FilterWikipediaXML>();
824 EXPECT_NE(filter_wikipedia_xml, nullptr);
825
826 // Create Map operation on ds
827 ds = ds->Map({filter_wikipedia_xml}, {"text"});
828 EXPECT_NE(ds, nullptr);
829
830 // Create an iterator over the result of the above dataset
831 // This will trigger the creation of the Execution Tree and launch it.
832 std::shared_ptr<Iterator> iter = ds->CreateIterator();
833 EXPECT_NE(iter, nullptr);
834
835 // Iterate the dataset and get each row
836 std::unordered_map<std::string, mindspore::MSTensor> row;
837 ASSERT_OK(iter->GetNextRow(&row));
838 std::vector<std::string> expected = {"welcome to beijing", "", ""};
839
840 uint64_t i = 0;
841
842 while (row.size() != 0) {
843 auto ind = row["text"];
844 std::shared_ptr<Tensor> de_expected_tensor;
845 ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
846 mindspore::MSTensor ms_expected_tensor =
847 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
848 EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
849 ASSERT_OK(iter->GetNextRow(&row));
850 i++;
851 }
852
853 EXPECT_EQ(i, 3);
854
855 // Manually terminate the pipeline
856 iter->Stop();
857 }
858
859 /// Feature: JiebaTokenizer op
860 /// Description: Test JiebaTokenizer op when the mode is JiebaMode::kMp and with_offsets=false
861 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestJiebaTokenizerSuccess)862 TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess) {
863 // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is false.
864 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess.";
865
866 // Create a TextFile dataset
867 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
868 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
869 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
870 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
871 EXPECT_NE(ds, nullptr);
872
873 // Create jieba_tokenizer operation on ds
874 std::shared_ptr<TensorTransform> jieba_tokenizer =
875 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
876 EXPECT_NE(jieba_tokenizer, nullptr);
877
878 // Create Map operation on ds
879 ds = ds->Map({jieba_tokenizer}, {"text"});
880 EXPECT_NE(ds, nullptr);
881
882 // Create an iterator over the result of the above dataset
883 // This will trigger the creation of the Execution Tree and launch it.
884 std::shared_ptr<Iterator> iter = ds->CreateIterator();
885 EXPECT_NE(iter, nullptr);
886
887 // Iterate the dataset and get each row
888 std::unordered_map<std::string, mindspore::MSTensor> row;
889 ASSERT_OK(iter->GetNextRow(&row));
890
891 std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
892 std::shared_ptr<Tensor> de_expected_tensor;
893 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
894 mindspore::MSTensor expected_tensor =
895 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
896
897 uint64_t i = 0;
898 while (row.size() != 0) {
899 auto ind = row["text"];
900 EXPECT_MSTENSOR_EQ(ind, expected_tensor);
901 ASSERT_OK(iter->GetNextRow(&row));
902 i++;
903 }
904
905 EXPECT_EQ(i, 1);
906
907 // Manually terminate the pipeline
908 iter->Stop();
909 }
910
911 /// Feature: JiebaTokenizer op
912 /// Description: Test JiebaTokenizer op when the mode is JiebaMode::kHmm and with_offsets=false
913 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestJiebaTokenizerSuccess1)914 TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess1) {
915 // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kHmm and the with_offsets is false.
916 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess1.";
917
918 // Create a TextFile dataset
919 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
920 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
921 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
922 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
923 EXPECT_NE(ds, nullptr);
924
925 // Create jieba_tokenizer operation on ds
926 std::shared_ptr<TensorTransform> jieba_tokenizer =
927 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kHmm);
928 EXPECT_NE(jieba_tokenizer, nullptr);
929
930 // Create Map operation on ds
931 ds = ds->Map({jieba_tokenizer}, {"text"});
932 EXPECT_NE(ds, nullptr);
933
934 // Create an iterator over the result of the above dataset
935 // This will trigger the creation of the Execution Tree and launch it.
936 std::shared_ptr<Iterator> iter = ds->CreateIterator();
937 EXPECT_NE(iter, nullptr);
938
939 // Iterate the dataset and get each row
940 std::unordered_map<std::string, mindspore::MSTensor> row;
941 ASSERT_OK(iter->GetNextRow(&row));
942
943 std::vector<std::string> expected = {"今天", "天气", "太", "好", "了", "我们", "一起", "去", "外面", "玩", "吧"};
944 std::shared_ptr<Tensor> de_expected_tensor;
945 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
946 mindspore::MSTensor expected_tensor =
947 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
948
949 uint64_t i = 0;
950 while (row.size() != 0) {
951 auto ind = row["text"];
952 EXPECT_MSTENSOR_EQ(ind, expected_tensor);
953 ASSERT_OK(iter->GetNextRow(&row));
954 i++;
955 }
956
957 EXPECT_EQ(i, 1);
958
959 // Manually terminate the pipeline
960 iter->Stop();
961 }
962
963 /// Feature: JiebaTokenizer op
964 /// Description: Test JiebaTokenizer op when the mode is JiebaMode::kMp and with_offsets=true
965 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestJiebaTokenizerSuccess2)966 TEST_F(MindDataTestPipeline, TestJiebaTokenizerSuccess2) {
967 // Testing the parameter of JiebaTokenizer interface when the mode is JiebaMode::kMp and the with_offsets is true.
968 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerSuccess2.";
969
970 // Create a TextFile dataset
971 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
972 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
973 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
974 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
975 EXPECT_NE(ds, nullptr);
976
977 // Create jieba_tokenizer operation on ds
978 std::shared_ptr<TensorTransform> jieba_tokenizer =
979 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp, true);
980 EXPECT_NE(jieba_tokenizer, nullptr);
981
982 // Create Map operation on ds
983 ds = ds->Map({jieba_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
984 EXPECT_NE(ds, nullptr);
985
986 // Create an iterator over the result of the above dataset
987 // This will trigger the creation of the Execution Tree and launch it.
988 std::shared_ptr<Iterator> iter = ds->CreateIterator();
989 EXPECT_NE(iter, nullptr);
990
991 // Iterate the dataset and get each row
992 std::unordered_map<std::string, mindspore::MSTensor> row;
993 ASSERT_OK(iter->GetNextRow(&row));
994
995 std::vector<std::string> expected_tokens = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
996 std::vector<uint32_t> expected_offsets_start = {0, 12, 21, 27, 33, 36, 42};
997 std::vector<uint32_t> expected_offsets_limit = {12, 21, 27, 33, 36, 42, 48};
998
999 std::shared_ptr<Tensor> de_expected_tokens;
1000 ASSERT_OK(Tensor::CreateFromVector(expected_tokens, &de_expected_tokens));
1001 mindspore::MSTensor ms_expected_tokens =
1002 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
1003
1004 std::shared_ptr<Tensor> de_expected_offsets_start;
1005 ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start, &de_expected_offsets_start));
1006 mindspore::MSTensor ms_expected_offsets_start =
1007 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
1008
1009 std::shared_ptr<Tensor> de_expected_offsets_limit;
1010 ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit, &de_expected_offsets_limit));
1011 mindspore::MSTensor ms_expected_offsets_limit =
1012 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
1013
1014 uint64_t i = 0;
1015 while (row.size() != 0) {
1016 auto ind = row["token"];
1017 EXPECT_MSTENSOR_EQ(ind, ms_expected_tokens);
1018
1019 auto start = row["offsets_start"];
1020 EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);
1021
1022 auto limit = row["offsets_limit"];
1023 EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);
1024
1025 ASSERT_OK(iter->GetNextRow(&row));
1026 i++;
1027 }
1028
1029 EXPECT_EQ(i, 1);
1030
1031 // Manually terminate the pipeline
1032 iter->Stop();
1033 }
1034
1035 /// Feature: JiebaTokenizer op
1036 /// Description: Test JiebaTokenizer op with empty hmm_path
1037 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestJiebaTokenizerFail1)1038 TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail1) {
1039 // Testing the incorrect parameter of JiebaTokenizer interface.
1040 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail1.";
1041
1042 // Create a TextFile dataset
1043 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
1044 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1045 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1046 EXPECT_NE(ds, nullptr);
1047
1048 // Create jieba_tokenizer operation on ds
1049 // Testing the parameter hmm_path is empty
1050 std::shared_ptr<TensorTransform> jieba_tokenizer =
1051 std::make_shared<text::JiebaTokenizer>("", mp_path, JiebaMode::kMp);
1052 EXPECT_NE(jieba_tokenizer, nullptr);
1053
1054 // Create a Map operation on ds
1055 ds = ds->Map({jieba_tokenizer});
1056 EXPECT_NE(ds, nullptr);
1057
1058 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1059 // Expect failure: invalid JiebaTokenizer input (parameter hmm_path is empty)
1060 EXPECT_EQ(iter, nullptr);
1061 }
1062
1063 /// Feature: JiebaTokenizer op
1064 /// Description: Test JiebaTokenizer op with empty mp_path
1065 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestJiebaTokenizerFail2)1066 TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail2) {
1067 // Testing the incorrect parameter of JiebaTokenizer interface.
1068 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail2.";
1069
1070 // Create a TextFile dataset
1071 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
1072 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1073 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1074 EXPECT_NE(ds, nullptr);
1075
1076 // Create jieba_tokenizer operation on ds
1077 // Testing the parameter mp_path is empty
1078 std::shared_ptr<TensorTransform> jieba_tokenizer =
1079 std::make_shared<text::JiebaTokenizer>(hmm_path, "", JiebaMode::kMp);
1080 EXPECT_NE(jieba_tokenizer, nullptr);
1081
1082 // Create a Map operation on ds
1083 ds = ds->Map({jieba_tokenizer});
1084 EXPECT_NE(ds, nullptr);
1085
1086 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1087 // Expect failure: invalid JiebaTokenizer input (parameter mp_path is empty)
1088 EXPECT_EQ(iter, nullptr);
1089 }
1090
1091 /// Feature: JiebaTokenizer op
1092 /// Description: Test JiebaTokenizer op with invalid hmm_path
1093 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestJiebaTokenizerFail3)1094 TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail3) {
1095 // Testing the incorrect parameter of JiebaTokenizer interface.
1096 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail3.";
1097
1098 // Create a TextFile dataset
1099 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
1100 std::string hmm_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
1101 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1102 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1103 EXPECT_NE(ds, nullptr);
1104
1105 // Create jieba_tokenizer operation on ds
1106 // Testing the parameter hmm_path is invalid path
1107 std::shared_ptr<TensorTransform> jieba_tokenizer =
1108 std::make_shared<text::JiebaTokenizer>(hmm_path_invalid, mp_path, JiebaMode::kMp);
1109 EXPECT_NE(jieba_tokenizer, nullptr);
1110
1111 // Create a Map operation on ds
1112 ds = ds->Map({jieba_tokenizer});
1113 EXPECT_NE(ds, nullptr);
1114
1115 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1116 // Expect failure: invalid JiebaTokenizer input (parameter hmm_path is invalid path)
1117 EXPECT_EQ(iter, nullptr);
1118 }
1119
1120 /// Feature: JiebaTokenizer op
1121 /// Description: Test JiebaTokenizer op with invalid mp_path
1122 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestJiebaTokenizerFail4)1123 TEST_F(MindDataTestPipeline, TestJiebaTokenizerFail4) {
1124 // Testing the incorrect parameter of JiebaTokenizer interface.
1125 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerFail4.";
1126
1127 // Create a TextFile dataset
1128 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
1129 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1130 std::string mp_path_invalid = datasets_root_path_ + "/jiebadict/1.txt";
1131 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1132 EXPECT_NE(ds, nullptr);
1133
1134 // Create jieba_tokenizer operation on ds
1135 // Testing the parameter mp_path is invalid path
1136 std::shared_ptr<TensorTransform> jieba_tokenizer =
1137 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path_invalid, JiebaMode::kMp);
1138 EXPECT_NE(jieba_tokenizer, nullptr);
1139
1140 // Create a Map operation on ds
1141 ds = ds->Map({jieba_tokenizer});
1142 EXPECT_NE(ds, nullptr);
1143
1144 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1145 // Expect failure: invalid JiebaTokenizer input (parameter mp_path is invalid path)
1146 EXPECT_EQ(iter, nullptr);
1147 }
1148
1149 /// Feature: JiebaTokenizer op
1150 /// Description: Test AddWord of JiebaTokenizer when the freq is not provided (default 0)
1151 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestJiebaTokenizerAddWord)1152 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord) {
1153 // Testing the parameter AddWord of JiebaTokenizer when the freq is not provided (default 0).
1154 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord.";
1155
1156 // Create a TextFile dataset
1157 std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
1158 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1159 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1160 std::shared_ptr<Dataset> ds = TextFile({data_file});
1161 EXPECT_NE(ds, nullptr);
1162
1163 // Create jieba_tokenizer operation on ds
1164 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
1165 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1166 EXPECT_NE(jieba_tokenizer, nullptr);
1167
1168 // Add word with freq not provided (default 0)
1169 ASSERT_OK(jieba_tokenizer->AddWord("男默女泪"));
1170
1171 // Create Map operation on ds
1172 ds = ds->Map({jieba_tokenizer}, {"text"});
1173 EXPECT_NE(ds, nullptr);
1174
1175 // Create an iterator over the result of the above dataset
1176 // This will trigger the creation of the Execution Tree and launch it.
1177 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1178 EXPECT_NE(iter, nullptr);
1179
1180 // Iterate the dataset and get each row
1181 std::unordered_map<std::string, mindspore::MSTensor> row;
1182 ASSERT_OK(iter->GetNextRow(&row));
1183
1184 std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
1185 std::shared_ptr<Tensor> de_expected_tensor;
1186 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
1187 mindspore::MSTensor expected_tensor =
1188 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1189
1190 uint64_t i = 0;
1191 while (row.size() != 0) {
1192 auto ind = row["text"];
1193 EXPECT_MSTENSOR_EQ(ind, expected_tensor);
1194 ASSERT_OK(iter->GetNextRow(&row));
1195 i++;
1196 }
1197
1198 EXPECT_EQ(i, 1);
1199
1200 // Manually terminate the pipeline
1201 iter->Stop();
1202 }
1203
1204 /// Feature: JiebaTokenizer op
1205 /// Description: Test AddWord of JiebaTokenizer when the freq is set explicitly to 0
1206 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestJiebaTokenizerAddWord1)1207 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord1) {
1208 // Testing the parameter AddWord of JiebaTokenizer when the freq is set explicitly to 0.
1209 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord1.";
1210
1211 // Create a TextFile dataset
1212 std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
1213 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1214 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1215 std::shared_ptr<Dataset> ds = TextFile({data_file});
1216 EXPECT_NE(ds, nullptr);
1217
1218 // Create jieba_tokenizer operation on ds
1219 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
1220 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1221 EXPECT_NE(jieba_tokenizer, nullptr);
1222
1223 // Add word with freq is set explicitly to 0
1224 ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 0));
1225
1226 // Create Map operation on ds
1227 ds = ds->Map({jieba_tokenizer}, {"text"});
1228 EXPECT_NE(ds, nullptr);
1229
1230 // Create an iterator over the result of the above dataset
1231 // This will trigger the creation of the Execution Tree and launch it.
1232 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1233 EXPECT_NE(iter, nullptr);
1234
1235 // Iterate the dataset and get each row
1236 std::unordered_map<std::string, mindspore::MSTensor> row;
1237 ASSERT_OK(iter->GetNextRow(&row));
1238
1239 std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
1240 std::shared_ptr<Tensor> de_expected_tensor;
1241 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
1242 mindspore::MSTensor expected_tensor =
1243 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1244
1245 uint64_t i = 0;
1246 while (row.size() != 0) {
1247 auto ind = row["text"];
1248 EXPECT_MSTENSOR_EQ(ind, expected_tensor);
1249 ASSERT_OK(iter->GetNextRow(&row));
1250 i++;
1251 }
1252
1253 EXPECT_EQ(i, 1);
1254
1255 // Manually terminate the pipeline
1256 iter->Stop();
1257 }
1258
1259 /// Feature: JiebaTokenizer op
1260 /// Description: Test AddWord of JiebaTokenizer when the freq is set to 10
1261 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestJiebaTokenizerAddWord2)1262 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord2) {
1263 // Testing the parameter AddWord of JiebaTokenizer when the freq is 10.
1264 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord2.";
1265
1266 // Create a TextFile dataset
1267 std::string data_file = datasets_root_path_ + "/testJiebaDataset/4.txt";
1268 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1269 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1270 std::shared_ptr<Dataset> ds = TextFile({data_file});
1271 EXPECT_NE(ds, nullptr);
1272
1273 // Create jieba_tokenizer operation on ds
1274 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
1275 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1276 EXPECT_NE(jieba_tokenizer, nullptr);
1277
1278 // Add word with freq 10
1279 ASSERT_OK(jieba_tokenizer->AddWord("男默女泪", 10));
1280
1281 // Create Map operation on ds
1282 ds = ds->Map({jieba_tokenizer}, {"text"});
1283 EXPECT_NE(ds, nullptr);
1284
1285 // Create an iterator over the result of the above dataset
1286 // This will trigger the creation of the Execution Tree and launch it.
1287 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1288 EXPECT_NE(iter, nullptr);
1289
1290 // Iterate the dataset and get each row
1291 std::unordered_map<std::string, mindspore::MSTensor> row;
1292 ASSERT_OK(iter->GetNextRow(&row));
1293
1294 std::vector<std::string> expected = {"男默女泪", "市", "长江大桥"};
1295 std::shared_ptr<Tensor> de_expected_tensor;
1296 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
1297 mindspore::MSTensor expected_tensor =
1298 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1299
1300 uint64_t i = 0;
1301 while (row.size() != 0) {
1302 auto ind = row["text"];
1303 EXPECT_MSTENSOR_EQ(ind, expected_tensor);
1304 ASSERT_OK(iter->GetNextRow(&row));
1305 i++;
1306 }
1307
1308 EXPECT_EQ(i, 1);
1309
1310 // Manually terminate the pipeline
1311 iter->Stop();
1312 }
1313
1314 /// Feature: JiebaTokenizer op
1315 /// Description: Test AddWord of JiebaTokenizer when the freq is 20000 which affects the result of segmentation
1316 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestJiebaTokenizerAddWord3)1317 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWord3) {
1318 // Testing the parameter AddWord of JiebaTokenizer when the freq is 20000 which affects the result of segmentation.
1319 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWord3.";
1320
1321 // Create a TextFile dataset
1322 std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
1323 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1324 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1325 std::shared_ptr<Dataset> ds = TextFile({data_file});
1326 EXPECT_NE(ds, nullptr);
1327
1328 // Create jieba_tokenizer operation on ds
1329 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
1330 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1331 EXPECT_NE(jieba_tokenizer, nullptr);
1332
1333 // Add word with freq 20000
1334 ASSERT_OK(jieba_tokenizer->AddWord("江大桥", 20000));
1335
1336 // Create Map operation on ds
1337 ds = ds->Map({jieba_tokenizer}, {"text"});
1338 EXPECT_NE(ds, nullptr);
1339
1340 // Create an iterator over the result of the above dataset
1341 // This will trigger the creation of the Execution Tree and launch it.
1342 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1343 EXPECT_NE(iter, nullptr);
1344
1345 // Iterate the dataset and get each row
1346 std::unordered_map<std::string, mindspore::MSTensor> row;
1347 ASSERT_OK(iter->GetNextRow(&row));
1348
1349 std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"};
1350 std::shared_ptr<Tensor> de_expected_tensor;
1351 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
1352 mindspore::MSTensor expected_tensor =
1353 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1354
1355 uint64_t i = 0;
1356 while (row.size() != 0) {
1357 auto ind = row["text"];
1358 EXPECT_MSTENSOR_EQ(ind, expected_tensor);
1359 ASSERT_OK(iter->GetNextRow(&row));
1360 i++;
1361 }
1362
1363 EXPECT_EQ(i, 1);
1364
1365 // Manually terminate the pipeline
1366 iter->Stop();
1367 }
1368
1369 /// Feature: JiebaTokenizer op
1370 /// Description: Test AddWord of JiebaTokenizer with invalid parameters
1371 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline,TestJiebaTokenizerAddWordFail)1372 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddWordFail) {
1373 // Testing the incorrect parameter of AddWord in JiebaTokenizer.
1374 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddWordFail.";
1375
1376 // Create a TextFile dataset
1377 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
1378 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1379 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1380 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1381 EXPECT_NE(ds, nullptr);
1382
1383 // Testing the parameter word of AddWord is empty
1384 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
1385 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1386 EXPECT_NE(jieba_tokenizer, nullptr);
1387 EXPECT_NE(jieba_tokenizer->AddWord("", 10), Status::OK());
1388 // Testing the parameter freq of AddWord is negative
1389 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer1 =
1390 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1391 EXPECT_NE(jieba_tokenizer1, nullptr);
1392 EXPECT_NE(jieba_tokenizer1->AddWord("我们", -1), Status::OK());
1393 }
1394
1395 /// Feature: JiebaTokenizer op
1396 /// Description: Test AddDict of JiebaTokenizer when the input is a vector of word-freq pair
1397 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestJiebaTokenizerAddDict)1398 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDict) {
1399 // Testing AddDict of JiebaTokenizer when the input is a vector of word-freq pair.
1400 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDict.";
1401
1402 // Create a TextFile dataset
1403 std::string data_file = datasets_root_path_ + "/testJiebaDataset/6.txt";
1404 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1405 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1406 std::shared_ptr<Dataset> ds = TextFile({data_file});
1407 EXPECT_NE(ds, nullptr);
1408
1409 // Create jieba_tokenizer operation on ds
1410 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
1411 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1412 EXPECT_NE(jieba_tokenizer, nullptr);
1413
1414 // Add word with freq 20000
1415 std::vector<std::pair<std::string, int64_t>> user_dict = {{"江大桥", 20000}};
1416 ASSERT_OK(jieba_tokenizer->AddDict(user_dict));
1417
1418 // Create Map operation on ds
1419 ds = ds->Map({jieba_tokenizer}, {"text"});
1420 EXPECT_NE(ds, nullptr);
1421
1422 // Create an iterator over the result of the above dataset
1423 // This will trigger the creation of the Execution Tree and launch it.
1424 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1425 EXPECT_NE(iter, nullptr);
1426
1427 // Iterate the dataset and get each row
1428 std::unordered_map<std::string, mindspore::MSTensor> row;
1429 ASSERT_OK(iter->GetNextRow(&row));
1430
1431 std::vector<std::string> expected = {"江州", "市长", "江大桥", "参加", "了", "长江大桥", "的", "通车", "仪式"};
1432 std::shared_ptr<Tensor> de_expected_tensor;
1433 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
1434 mindspore::MSTensor expected_tensor =
1435 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1436
1437 uint64_t i = 0;
1438 while (row.size() != 0) {
1439 auto txt = row["text"];
1440 EXPECT_MSTENSOR_EQ(txt, expected_tensor);
1441 ASSERT_OK(iter->GetNextRow(&row));
1442 i++;
1443 }
1444
1445 EXPECT_EQ(i, 1);
1446
1447 // Manually terminate the pipeline
1448 iter->Stop();
1449 }
1450
1451 /// Feature: JiebaTokenizer op
1452 /// Description: Test AddDict of JiebaTokenizer when the input is a path to dict
1453 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestJiebaTokenizerAddDictFromFile)1454 TEST_F(MindDataTestPipeline, TestJiebaTokenizerAddDictFromFile) {
1455 // Testing AddDict of JiebaTokenizer when the input is a path to dict.
1456 // Test error scenario for AddDict: invalid path
1457 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestJiebaTokenizerAddDictFromFile.";
1458
1459 // Create a TextFile dataset
1460 std::string data_file = datasets_root_path_ + "/testJiebaDataset/3.txt";
1461 std::string hmm_path = datasets_root_path_ + "/jiebadict/hmm_model.utf8";
1462 std::string mp_path = datasets_root_path_ + "/jiebadict/jieba.dict.utf8";
1463 std::shared_ptr<Dataset> ds = TextFile({data_file});
1464 EXPECT_NE(ds, nullptr);
1465
1466 // Create jieba_tokenizer operation on ds
1467 std::shared_ptr<text::JiebaTokenizer> jieba_tokenizer =
1468 std::make_shared<text::JiebaTokenizer>(hmm_path, mp_path, JiebaMode::kMp);
1469 EXPECT_NE(jieba_tokenizer, nullptr);
1470
1471 // Load dict from txt file
1472 std::string user_dict_path = datasets_root_path_ + "/testJiebaDataset/user_dict.txt";
1473 std::string invalid_path = datasets_root_path_ + "/testJiebaDataset/invalid_path.txt";
1474 EXPECT_ERROR(jieba_tokenizer->AddDict(invalid_path));
1475 ASSERT_OK(jieba_tokenizer->AddDict(user_dict_path));
1476
1477 // Create Map operation on ds
1478 ds = ds->Map({jieba_tokenizer}, {"text"});
1479 EXPECT_NE(ds, nullptr);
1480
1481 // Create an iterator over the result of the above dataset
1482 // This will trigger the creation of the Execution Tree and launch it.
1483 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1484 EXPECT_NE(iter, nullptr);
1485
1486 // Iterate the dataset and get each row
1487 std::unordered_map<std::string, mindspore::MSTensor> row;
1488 ASSERT_OK(iter->GetNextRow(&row));
1489
1490 std::vector<std::string> expected = {"今天天气", "太好了", "我们", "一起", "去", "外面", "玩吧"};
1491 std::shared_ptr<Tensor> de_expected_tensor;
1492 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
1493 mindspore::MSTensor expected_tensor =
1494 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1495
1496 uint64_t i = 0;
1497 while (row.size() != 0) {
1498 auto txt = row["text"];
1499 EXPECT_MSTENSOR_EQ(txt, expected_tensor);
1500 ASSERT_OK(iter->GetNextRow(&row));
1501 i++;
1502 }
1503
1504 EXPECT_EQ(i, 1);
1505
1506 // Manually terminate the pipeline
1507 iter->Stop();
1508 }
1509
1510 /// Feature: SlidingWindow op
1511 /// Description: Test SlidingWindow when the axis is 0
1512 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestSlidingWindowSuccess)1513 TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess) {
1514 // Testing the parameter of SlidingWindow interface when the axis is 0.
1515 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess.";
1516
1517 // Create a TextFile dataset
1518 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
1519 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1520 EXPECT_NE(ds, nullptr);
1521
1522 // Create white_tokenizer operation on ds
1523 std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
1524 EXPECT_NE(white_tokenizer, nullptr);
1525 // Create sliding_window operation on ds
1526 std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(3, 0);
1527 EXPECT_NE(sliding_window, nullptr);
1528
1529 // Create Map operation on ds
1530 ds = ds->Map({white_tokenizer, sliding_window}, {"text"});
1531 EXPECT_NE(ds, nullptr);
1532
1533 // Create an iterator over the result of the above dataset
1534 // This will trigger the creation of the Execution Tree and launch it.
1535 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1536 EXPECT_NE(iter, nullptr);
1537
1538 // Iterate the dataset and get each row
1539 std::unordered_map<std::string, mindspore::MSTensor> row;
1540 ASSERT_OK(iter->GetNextRow(&row));
1541
1542 std::vector<std::vector<std::string>> expected = {{"This", "is", "a", "is", "a", "text", "a", "text", "file."},
1543 {"Be", "happy", "every", "happy", "every", "day."},
1544 {"Good", "luck", "to", "luck", "to", "everyone."}};
1545
1546 uint64_t i = 0;
1547 while (row.size() != 0) {
1548 auto ind = row["text"];
1549
1550 std::shared_ptr<Tensor> de_expected_tensor;
1551 int x = expected[i].size() / 3;
1552 ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x, 3}), &de_expected_tensor));
1553 mindspore::MSTensor expected_tensor =
1554 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1555 EXPECT_MSTENSOR_EQ(ind, expected_tensor);
1556
1557 ASSERT_OK(iter->GetNextRow(&row));
1558 i++;
1559 }
1560
1561 EXPECT_EQ(i, 3);
1562
1563 // Manually terminate the pipeline
1564 iter->Stop();
1565 }
1566
1567 /// Feature: SlidingWindow op
1568 /// Description: Test SlidingWindow when the axis is -1
1569 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestSlidingWindowSuccess1)1570 TEST_F(MindDataTestPipeline, TestSlidingWindowSuccess1) {
1571 // Testing the parameter of SlidingWindow interface when the axis is -1.
1572 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowSuccess1.";
1573
1574 // Create a TextFile dataset
1575 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
1576 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1577 EXPECT_NE(ds, nullptr);
1578
1579 // Create white_tokenizer operation on ds
1580 std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
1581 EXPECT_NE(white_tokenizer, nullptr);
1582 // Create sliding_window operation on ds
1583 std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(2, -1);
1584 EXPECT_NE(sliding_window, nullptr);
1585
1586 // Create Map operation on ds
1587 ds = ds->Map({white_tokenizer, sliding_window}, {"text"});
1588 EXPECT_NE(ds, nullptr);
1589
1590 // Create an iterator over the result of the above dataset
1591 // This will trigger the creation of the Execution Tree and launch it.
1592 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1593 EXPECT_NE(iter, nullptr);
1594
1595 // Iterate the dataset and get each row
1596 std::unordered_map<std::string, mindspore::MSTensor> row;
1597 ASSERT_OK(iter->GetNextRow(&row));
1598
1599 std::vector<std::vector<std::string>> expected = {{"This", "is", "is", "a", "a", "text", "text", "file."},
1600 {"Be", "happy", "happy", "every", "every", "day."},
1601 {"Good", "luck", "luck", "to", "to", "everyone."}};
1602 uint64_t i = 0;
1603 while (row.size() != 0) {
1604 auto ind = row["text"];
1605
1606 std::shared_ptr<Tensor> de_expected_tensor;
1607 int x = expected[i].size() / 2;
1608 ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x, 2}), &de_expected_tensor));
1609 mindspore::MSTensor expected_tensor =
1610 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1611 EXPECT_MSTENSOR_EQ(ind, expected_tensor);
1612
1613 ASSERT_OK(iter->GetNextRow(&row));
1614 i++;
1615 }
1616
1617 EXPECT_EQ(i, 3);
1618
1619 // Manually terminate the pipeline
1620 iter->Stop();
1621 }
1622
1623 /// Feature: SlidingWindow op
1624 /// Description: Test SlidingWindow when the width=0
1625 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestSlidingWindowFail1)1626 TEST_F(MindDataTestPipeline, TestSlidingWindowFail1) {
1627 // Testing the incorrect parameter of SlidingWindow interface.
1628 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail1.";
1629
1630 // Create a TextFile dataset
1631 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
1632 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1633 EXPECT_NE(ds, nullptr);
1634
1635 // Create sliding_window operation on ds
1636 // Testing the parameter width less than or equal to 0
1637 // The parameter axis support 0 or -1 only for now
1638 std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(0, 0);
1639 EXPECT_NE(sliding_window, nullptr);
1640
1641 // Create a Map operation on ds
1642 ds = ds->Map({sliding_window});
1643 EXPECT_NE(ds, nullptr);
1644
1645 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1646 // Expect failure: invalid SlidingWindow input (width less than or equal to 0)
1647 EXPECT_EQ(iter, nullptr);
1648 }
1649
1650 /// Feature: SlidingWindow op
1651 /// Description: Test SlidingWindow when the width=-2
1652 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestSlidingWindowFail2)1653 TEST_F(MindDataTestPipeline, TestSlidingWindowFail2) {
1654 // Testing the incorrect parameter of SlidingWindow interface.
1655 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestSlidingWindowFail2.";
1656
1657 // Create a TextFile dataset
1658 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
1659 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1660 EXPECT_NE(ds, nullptr);
1661
1662 // Create sliding_window operation on ds
1663 // Testing the parameter width less than or equal to 0
1664 // The parameter axis support 0 or -1 only for now
1665 std::shared_ptr<TensorTransform> sliding_window = std::make_shared<text::SlidingWindow>(-2, 0);
1666 EXPECT_NE(sliding_window, nullptr);
1667
1668 // Create a Map operation on ds
1669 ds = ds->Map({sliding_window});
1670 EXPECT_NE(ds, nullptr);
1671
1672 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1673 // Expect failure: invalid SlidingWindow input (width less than or equal to 0)
1674 EXPECT_EQ(iter, nullptr);
1675 }
1676
1677 /// Feature: ToNumber op
1678 /// Description: Test ToNumber with integer numbers
1679 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestToNumberSuccess1)1680 TEST_F(MindDataTestPipeline, TestToNumberSuccess1) {
1681 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberSuccess1.";
1682 // Test ToNumber with integer numbers
1683
1684 std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1685
1686 // Create a TextFile dataset
1687 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1688 EXPECT_NE(ds, nullptr);
1689
1690 // Create a Take operation on ds
1691 ds = ds->Take(8);
1692 EXPECT_NE(ds, nullptr);
1693
1694 // Create ToNumber operation on ds
1695 std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
1696 EXPECT_NE(to_number, nullptr);
1697
1698 // Create a Map operation on ds
1699 ds = ds->Map({to_number}, {"text"});
1700 EXPECT_NE(ds, nullptr);
1701
1702 // Create an iterator over the result of the above dataset
1703 // This will trigger the creation of the Execution Tree and launch it.
1704 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1705 EXPECT_NE(iter, nullptr);
1706
1707 // Iterate the dataset and get each row
1708 std::unordered_map<std::string, mindspore::MSTensor> row;
1709 ASSERT_OK(iter->GetNextRow(&row));
1710
1711 std::vector<int64_t> expected = {-121, 14, -2219, 7623, -8162536, 162371864, -1726483716, 98921728421};
1712
1713 uint64_t i = 0;
1714 while (row.size() != 0) {
1715 auto ind = row["text"];
1716 std::shared_ptr<Tensor> de_expected_tensor;
1717 ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
1718 mindspore::MSTensor ms_expected_tensor =
1719 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1720 EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
1721 ASSERT_OK(iter->GetNextRow(&row));
1722 i++;
1723 }
1724
1725 EXPECT_EQ(i, 8);
1726
1727 // Manually terminate the pipeline
1728 iter->Stop();
1729 }
1730
1731 /// Feature: ToNumber op
1732 /// Description: Test ToNumber with float numbers
1733 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestToNumberSuccess2)1734 TEST_F(MindDataTestPipeline, TestToNumberSuccess2) {
1735 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberSuccess2.";
1736 // Test ToNumber with float numbers
1737
1738 std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1739
1740 // Create a TextFile dataset
1741 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1742 EXPECT_NE(ds, nullptr);
1743
1744 // Create a Skip operation on ds
1745 ds = ds->Skip(8);
1746 EXPECT_NE(ds, nullptr);
1747
1748 // Create a Take operation on ds
1749 ds = ds->Take(6);
1750 EXPECT_NE(ds, nullptr);
1751
1752 // Create ToNumber operation on ds
1753 std::shared_ptr<TensorTransform> to_number =
1754 std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat64);
1755 EXPECT_NE(to_number, nullptr);
1756
1757 // Create a Map operation on ds
1758 ds = ds->Map({to_number}, {"text"});
1759 EXPECT_NE(ds, nullptr);
1760
1761 // Create an iterator over the result of the above dataset
1762 // This will trigger the creation of the Execution Tree and launch it.
1763 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1764 EXPECT_NE(iter, nullptr);
1765
1766 // Iterate the dataset and get each row
1767 std::unordered_map<std::string, mindspore::MSTensor> row;
1768 ASSERT_OK(iter->GetNextRow(&row));
1769
1770 std::vector<double_t> expected = {-1.1, 1.4, -2219.321, 7623.453, -816256.234282, 162371864.243243};
1771
1772 uint64_t i = 0;
1773 while (row.size() != 0) {
1774 auto ind = row["text"];
1775 std::shared_ptr<Tensor> de_expected_tensor;
1776 ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
1777 mindspore::MSTensor ms_expected_tensor =
1778 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
1779 EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
1780 ASSERT_OK(iter->GetNextRow(&row));
1781 i++;
1782 }
1783
1784 EXPECT_EQ(i, 6);
1785
1786 // Manually terminate the pipeline
1787 iter->Stop();
1788 }
1789
/// Feature: ToNumber op
/// Description: Test ToNumber with overflow integer numbers
/// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestToNumberFail1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail1.";
  // Test ToNumber with overflow integer numbers

  std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Create a TextFile dataset
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a Skip operation on ds
  // Skip the first 2 rows so the remaining values (e.g. -2219, 7623, ...) lie
  // outside the int8 range [-128, 127] and must fail the conversion.
  ds = ds->Skip(2);
  EXPECT_NE(ds, nullptr);

  // Create a Take operation on ds
  ds = ds->Take(6);
  EXPECT_NE(ds, nullptr);

  // Create ToNumber operation on ds with a deliberately too-small target type
  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt8);
  EXPECT_NE(to_number, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({to_number}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  // Pipeline construction itself succeeds; the overflow is a runtime error.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;

  // Expect error: input out of bounds of int8
  EXPECT_ERROR(iter->GetNextRow(&row));

  // After the failed fetch, row is expected to stay empty, so this loop body
  // should never run; the counter check below asserts exactly that.
  uint64_t i = 0;
  while (row.size() != 0) {
    EXPECT_ERROR(iter->GetNextRow(&row));
    i++;
  }

  // Expect failure: GetNextRow fail and return nothing
  EXPECT_EQ(i, 0);

  // Manually terminate the pipeline
  iter->Stop();
}
1842
/// Feature: ToNumber op
/// Description: Test ToNumber with overflow float numbers
/// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestToNumberFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail2.";
  // Test ToNumber with overflow float numbers

  std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Create a TextFile dataset
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a Skip operation on ds
  // Skip to rows 13-14, whose magnitudes (e.g. -816256.234282) exceed the
  // float16 representable range, so the conversion must fail.
  ds = ds->Skip(12);
  EXPECT_NE(ds, nullptr);

  // Create a Take operation on ds
  ds = ds->Take(2);
  EXPECT_NE(ds, nullptr);

  // Create ToNumber operation on ds with a deliberately too-small target type
  std::shared_ptr<TensorTransform> to_number =
    std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeFloat16);
  EXPECT_NE(to_number, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({to_number}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  // Pipeline construction itself succeeds; the overflow is a runtime error.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;

  // Expect error: input out of bounds of float16
  EXPECT_ERROR(iter->GetNextRow(&row));

  // After the failed fetch, row is expected to stay empty, so this loop body
  // should never run; the counter check below asserts exactly that.
  uint64_t i = 0;
  while (row.size() != 0) {
    EXPECT_ERROR(iter->GetNextRow(&row));
    i++;
  }

  // Expect failure: GetNextRow fail and return nothing
  EXPECT_EQ(i, 0);

  // Manually terminate the pipeline
  iter->Stop();
}
1896
/// Feature: ToNumber op
/// Description: Test ToNumber with non numerical input
/// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestToNumberFail3) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail3.";
  // Test ToNumber with non numerical input

  std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";

  // Create a TextFile dataset
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a Skip operation on ds
  // Skip past the 14 numeric rows so only non-numerical text remains.
  ds = ds->Skip(14);
  EXPECT_NE(ds, nullptr);

  // Create ToNumber operation on ds
  std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeInt64);
  EXPECT_NE(to_number, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({to_number}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  // Pipeline construction itself succeeds; the parse failure is a runtime error.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;

  // Expect error: invalid input which is non numerical
  EXPECT_ERROR(iter->GetNextRow(&row));

  // After the failed fetch, row is expected to stay empty, so this loop body
  // should never run; the counter check below asserts exactly that.
  uint64_t i = 0;
  while (row.size() != 0) {
    EXPECT_ERROR(iter->GetNextRow(&row));
    i++;
  }

  // Expect failure: GetNextRow fail and return nothing
  EXPECT_EQ(i, 0);

  // Manually terminate the pipeline
  iter->Stop();
}
1945
1946 /// Feature: ToNumber op
1947 /// Description: Test ToNumber with non numerical data type (kObjectTypeString)
1948 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestToNumberFail4)1949 TEST_F(MindDataTestPipeline, TestToNumberFail4) {
1950 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail4.";
1951 // Test ToNumber with non numerical data type
1952
1953 std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1954
1955 // Create a TextFile dataset
1956 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1957 EXPECT_NE(ds, nullptr);
1958
1959 // Create ToNumber operation on ds
1960 std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kObjectTypeString);
1961 EXPECT_NE(to_number, nullptr);
1962
1963 // Create a Map operation on ds
1964 ds = ds->Map({to_number}, {"text"});
1965 EXPECT_NE(ds, nullptr);
1966
1967 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1968 // Expect failure: invalid parameter with non numerical data type
1969 EXPECT_EQ(iter, nullptr);
1970 }
1971
1972 /// Feature: ToNumber op
1973 /// Description: Test ToNumber with non numerical data type (kObjectTypeBool)
1974 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestToNumberFail5)1975 TEST_F(MindDataTestPipeline, TestToNumberFail5) {
1976 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestToNumberFail5.";
1977 // Test ToNumber with non numerical data type
1978
1979 std::string data_file = datasets_root_path_ + "/testTokenizerData/to_number.txt";
1980
1981 // Create a TextFile dataset
1982 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
1983 EXPECT_NE(ds, nullptr);
1984
1985 // Create ToNumber operation on ds
1986 std::shared_ptr<TensorTransform> to_number = std::make_shared<text::ToNumber>(mindspore::DataType::kNumberTypeBool);
1987 EXPECT_NE(to_number, nullptr);
1988
1989 // Create a Map operation on ds
1990 ds = ds->Map({to_number}, {"text"});
1991 EXPECT_NE(ds, nullptr);
1992
1993 std::shared_ptr<Iterator> iter = ds->CreateIterator();
1994 // Expect failure: invalid parameter with non numerical data type
1995 EXPECT_EQ(iter, nullptr);
1996 }
1997
/// Feature: TruncateSequencePair op
/// Description: Test TruncateSequencePair basic usage
/// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestTruncateSequencePairSuccess1) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairSuccess1.";
  // Testing basic TruncateSequencePair

  // Set seed for RandomDataset
  // The expected values below depend on this exact seed (0) together with a
  // single parallel worker, so both are pinned and restored afterwards.
  auto original_seed = config::get_seed();
  bool status_set_seed = config::set_seed(0);
  EXPECT_EQ(status_set_seed, true);

  // Set num_parallel_workers for RandomDataset
  auto original_worker = config::get_num_parallel_workers();
  bool status_set_worker = config::set_num_parallel_workers(1);
  EXPECT_EQ(status_set_worker, true);

  // Create a RandomDataset which has column names "col1" and "col2"
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt16, {5}));
  ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt32, {3}));
  std::shared_ptr<Dataset> ds = RandomData(3, schema);
  EXPECT_NE(ds, nullptr);

  // Create a truncate_sequence_pair operation on ds
  // max_length=4: the expected outputs show 2 elements kept per column,
  // i.e. the pair is truncated to a combined length of 4.
  std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(4);
  EXPECT_NE(truncate_sequence_pair, nullptr);

  // Create Map operation on ds
  ds = ds->Map({truncate_sequence_pair}, {"col1", "col2"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Seed-dependent RandomData output after truncation; regenerate if the
  // RNG or RandomData implementation changes.
  std::vector<std::vector<int16_t>> expected1 = {{-29556, -29556}, {-18505, -18505}, {-25958, -25958}};
  std::vector<std::vector<int32_t>> expected2 = {
    {-1751672937, -1751672937}, {-656877352, -656877352}, {-606348325, -606348325}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind1 = row["col1"];
    auto ind2 = row["col2"];

    std::shared_ptr<Tensor> de_expected_tensor1;
    ASSERT_OK(Tensor::CreateFromVector(expected1[i], &de_expected_tensor1));
    mindspore::MSTensor expected_tensor1 =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor1));
    EXPECT_MSTENSOR_EQ(ind1, expected_tensor1);

    std::shared_ptr<Tensor> de_expected_tensor2;
    ASSERT_OK(Tensor::CreateFromVector(expected2[i], &de_expected_tensor2));
    mindspore::MSTensor expected_tensor2 =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor2));
    EXPECT_MSTENSOR_EQ(ind2, expected_tensor2);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 3);

  // Manually terminate the pipeline
  iter->Stop();

  // Restore original seed and num_parallel_workers
  status_set_seed = config::set_seed(original_seed);
  EXPECT_EQ(status_set_seed, true);
  status_set_worker = config::set_num_parallel_workers(original_worker);
  EXPECT_EQ(status_set_worker, true);
}
2075
/// Feature: TruncateSequencePair op
/// Description: Test TruncateSequencePair with odd max_length
/// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestTruncateSequencePairSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairSuccess2.";
  // Testing basic TruncateSequencePair with odd max_length

  // Set seed for RandomDataset
  // The expected values below depend on this exact seed (1) together with a
  // single parallel worker, so both are pinned and restored afterwards.
  auto original_seed = config::get_seed();
  bool status_set_seed = config::set_seed(1);
  EXPECT_EQ(status_set_seed, true);

  // Set num_parallel_workers for RandomDataset
  auto original_worker = config::get_num_parallel_workers();
  bool status_set_worker = config::set_num_parallel_workers(1);
  EXPECT_EQ(status_set_worker, true);

  // Create a RandomDataset which has column names "col1" and "col2"
  std::shared_ptr<SchemaObj> schema = Schema();
  ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt32, {4}));
  ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt64, {4}));
  std::shared_ptr<Dataset> ds = RandomData(4, schema);
  EXPECT_NE(ds, nullptr);

  // Create a truncate_sequence_pair operation on ds
  // max_length=5 (odd): the expected outputs show 3 elements kept in col1 and
  // 2 in col2, i.e. the combined truncated length is 5.
  std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(5);
  EXPECT_NE(truncate_sequence_pair, nullptr);

  // Create Map operation on ds
  ds = ds->Map({truncate_sequence_pair}, {"col1", "col2"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Seed-dependent RandomData output after truncation; regenerate if the
  // RNG or RandomData implementation changes.
  std::vector<std::vector<int32_t>> expected1 = {{1785358954, 1785358954, 1785358954},
                                                 {-1195853640, -1195853640, -1195853640},
                                                 {0, 0, 0},
                                                 {1296911693, 1296911693, 1296911693}};
  std::vector<std::vector<int64_t>> expected2 = {
    {-1, -1}, {-1229782938247303442, -1229782938247303442}, {2314885530818453536, 2314885530818453536}, {-1, -1}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind1 = row["col1"];
    auto ind2 = row["col2"];

    std::shared_ptr<Tensor> de_expected_tensor1;
    ASSERT_OK(Tensor::CreateFromVector(expected1[i], &de_expected_tensor1));
    mindspore::MSTensor expected_tensor1 =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor1));
    EXPECT_MSTENSOR_EQ(ind1, expected_tensor1);

    std::shared_ptr<Tensor> de_expected_tensor2;
    ASSERT_OK(Tensor::CreateFromVector(expected2[i], &de_expected_tensor2));
    mindspore::MSTensor expected_tensor2 =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor2));
    EXPECT_MSTENSOR_EQ(ind2, expected_tensor2);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();

  // Restore original seed and num_parallel_workers
  status_set_seed = config::set_seed(original_seed);
  EXPECT_EQ(status_set_seed, true);
  status_set_worker = config::set_num_parallel_workers(original_worker);
  EXPECT_EQ(status_set_worker, true);
}
2156
2157 /// Feature: TruncateSequencePair op
2158 /// Description: Test TruncateSequencePair with negative max_length
2159 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestTruncateSequencePairFail)2160 TEST_F(MindDataTestPipeline, TestTruncateSequencePairFail) {
2161 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSequencePairFail.";
2162 // Testing TruncateSequencePair with negative max_length
2163
2164 // Create a RandomDataset which has column names "col1" and "col2"
2165 std::shared_ptr<SchemaObj> schema = Schema();
2166 ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt8, {3}));
2167 ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt8, {3}));
2168 std::shared_ptr<Dataset> ds = RandomData(3, schema);
2169 EXPECT_NE(ds, nullptr);
2170
2171 // Create a truncate_sequence_pair operation on ds
2172 std::shared_ptr<TensorTransform> truncate_sequence_pair = std::make_shared<text::TruncateSequencePair>(-1);
2173 EXPECT_NE(truncate_sequence_pair, nullptr);
2174
2175 // Create a Map operation on ds
2176 ds = ds->Map({truncate_sequence_pair});
2177 EXPECT_NE(ds, nullptr);
2178
2179 std::shared_ptr<Iterator> iter = ds->CreateIterator();
2180 // Expect failure: invalid TruncateSequencePair input (invalid parameter with negative max_length)
2181 EXPECT_EQ(iter, nullptr);
2182 }
2183
2184 /// Feature: Ngram op
2185 /// Description: Test parameters for Ngram interface
2186 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestNgramSuccess)2187 TEST_F(MindDataTestPipeline, TestNgramSuccess) {
2188 // Testing the parameter of Ngram interface.
2189 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess.";
2190
2191 // Create a TextFile dataset
2192 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
2193 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2194 EXPECT_NE(ds, nullptr);
2195
2196 // Create white_tokenizer operation on ds
2197 std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
2198 EXPECT_NE(white_tokenizer, nullptr);
2199 // Create sliding_window operation on ds
2200 auto ngram_op = std::make_shared<text::Ngram>(
2201 std::vector<int>{2}, std::pair<std::string, int32_t>{"_", 1}, std::pair<std::string, int32_t>{"_", 1}, " ");
2202 EXPECT_NE(ngram_op, nullptr);
2203
2204 // Create Map operation on ds
2205 ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
2206 EXPECT_NE(ds, nullptr);
2207
2208 // Create an iterator over the result of the above dataset
2209 // This will trigger the creation of the Execution Tree and launch it.
2210 std::shared_ptr<Iterator> iter = ds->CreateIterator();
2211 EXPECT_NE(iter, nullptr);
2212
2213 // Iterate the dataset and get each row
2214 std::unordered_map<std::string, mindspore::MSTensor> row;
2215 ASSERT_OK(iter->GetNextRow(&row));
2216
2217 std::vector<std::vector<std::string>> expected = {{"_ This", "This is", "is a", "a text", "text file.", "file. _"},
2218 {"_ Be", "Be happy", "happy every", "every day.", "day. _"},
2219 {"_ Good", "Good luck", "luck to", "to everyone.", "everyone. _"}};
2220
2221 uint64_t i = 0;
2222 while (row.size() != 0) {
2223 auto ind = row["text"];
2224
2225 std::shared_ptr<Tensor> de_expected_tensor;
2226 int x = expected[i].size();
2227 ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
2228 mindspore::MSTensor expected_tensor =
2229 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
2230 EXPECT_MSTENSOR_EQ(ind, expected_tensor);
2231
2232 ASSERT_OK(iter->GetNextRow(&row));
2233 i++;
2234 }
2235
2236 EXPECT_EQ(i, 3);
2237
2238 // Manually terminate the pipeline
2239 iter->Stop();
2240 }
2241
2242 /// Feature: Ngram op
2243 /// Description: Test Ngram basic usage
2244 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestNgramSuccess1)2245 TEST_F(MindDataTestPipeline, TestNgramSuccess1) {
2246 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramSuccess1.";
2247
2248 // Create a TextFile dataset
2249 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
2250 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
2251 EXPECT_NE(ds, nullptr);
2252
2253 // Create white_tokenizer operation on ds
2254 std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
2255 EXPECT_NE(white_tokenizer, nullptr);
2256 // Create sliding_window operation on ds
2257 auto ngram_op = std::make_shared<text::Ngram>(
2258 std::vector<int32_t>{2, 3}, std::pair<std::string, int32_t>{"&", 2}, std::pair<std::string, int32_t>{"&", 2}, "-");
2259 EXPECT_NE(ngram_op, nullptr);
2260
2261 // Create Map operation on ds
2262 ds = ds->Map({white_tokenizer, ngram_op}, {"text"});
2263 EXPECT_NE(ds, nullptr);
2264
2265 // Create an iterator over the result of the above dataset
2266 // This will trigger the creation of the Execution Tree and launch it.
2267 std::shared_ptr<Iterator> iter = ds->CreateIterator();
2268 EXPECT_NE(iter, nullptr);
2269
2270 // Iterate the dataset and get each row
2271 std::unordered_map<std::string, mindspore::MSTensor> row;
2272 ASSERT_OK(iter->GetNextRow(&row));
2273
2274 std::vector<std::vector<std::string>> expected = {
2275 {"&-This", "This-is", "is-a", "a-text", "text-file.", "file.-&", "&-&-This", "&-This-is", "This-is-a", "is-a-text",
2276 "a-text-file.", "text-file.-&", "file.-&-&"},
2277 {"&-Be", "Be-happy", "happy-every", "every-day.", "day.-&", "&-&-Be", "&-Be-happy", "Be-happy-every",
2278 "happy-every-day.", "every-day.-&", "day.-&-&"},
2279 {"&-Good", "Good-luck", "luck-to", "to-everyone.", "everyone.-&", "&-&-Good", "&-Good-luck", "Good-luck-to",
2280 "luck-to-everyone.", "to-everyone.-&", "everyone.-&-&"}};
2281
2282 uint64_t i = 0;
2283 while (row.size() != 0) {
2284 auto ind = row["text"];
2285
2286 std::shared_ptr<Tensor> de_expected_tensor;
2287 int x = expected[i].size();
2288 ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
2289 mindspore::MSTensor expected_tensor =
2290 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
2291 EXPECT_MSTENSOR_EQ(ind, expected_tensor);
2292
2293 ASSERT_OK(iter->GetNextRow(&row));
2294 i++;
2295 }
2296
2297 EXPECT_EQ(i, 3);
2298
2299 // Manually terminate the pipeline
2300 iter->Stop();
2301 }
2302
2303 /// Feature: Ngram op
2304 /// Description: Test Ngram where the vector of ngram is empty
2305 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestNgramFail1) {
  // Validate Ngram parameter checking: an empty ngrams vector must be rejected.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail1.";

  // Build a non-shuffled TextFile dataset as the pipeline source
  std::string tf_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  auto dataset = TextFile({tf_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Construct Ngram with an empty ngrams vector; construction itself succeeds,
  // the invalid argument is only caught when the pipeline is validated
  auto ngram_op = std::make_shared<text::Ngram>(std::vector<int32_t>{});
  EXPECT_NE(ngram_op, nullptr);

  // Attach the invalid op via a Map operation
  dataset = dataset->Map({ngram_op});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator triggers pipeline validation
  // Expect failure: invalid Ngram input (the vector of ngram is empty)
  auto it = dataset->CreateIterator();
  EXPECT_EQ(it, nullptr);
}
2328
2329 /// Feature: Ngram op
2330 /// Description: Test Ngram where value of ngram vector is equal to 0
2331 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestNgramFail2) {
  // Validate Ngram parameter checking: an n-gram size of 0 must be rejected.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail2.";

  // Build a non-shuffled TextFile dataset as the pipeline source
  std::string tf_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  auto dataset = TextFile({tf_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Construct Ngram with a zero-valued entry in the ngrams vector; the
  // invalid value is only caught when the pipeline is validated
  auto ngram_op = std::make_shared<text::Ngram>(std::vector<int32_t>{0});
  EXPECT_NE(ngram_op, nullptr);

  // Attach the invalid op via a Map operation
  dataset = dataset->Map({ngram_op});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator triggers pipeline validation
  // Expect failure: invalid Ngram input (the value of ngrams vector less than and equal to 0)
  auto it = dataset->CreateIterator();
  EXPECT_EQ(it, nullptr);
}
2354
2355 /// Feature: Ngram op
2356 /// Description: Test Ngram where value of ngram vector is less than 0
2357 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestNgramFail3) {
  // Validate Ngram parameter checking: a negative n-gram size must be rejected.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail3.";

  // Build a non-shuffled TextFile dataset as the pipeline source
  std::string tf_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  auto dataset = TextFile({tf_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Construct Ngram with a negative entry in the ngrams vector; the
  // invalid value is only caught when the pipeline is validated
  auto ngram_op = std::make_shared<text::Ngram>(std::vector<int32_t>{-2});
  EXPECT_NE(ngram_op, nullptr);

  // Attach the invalid op via a Map operation
  dataset = dataset->Map({ngram_op});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator triggers pipeline validation
  // Expect failure: invalid Ngram input (the value of ngrams vector less than and equal to 0)
  auto it = dataset->CreateIterator();
  EXPECT_EQ(it, nullptr);
}
2380
2381 /// Feature: Ngram op
2382 /// Description: Test Ngram where second parameter pad_width in left_pad vector is less than 0
2383 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestNgramFail4) {
  // Validate Ngram parameter checking: a negative pad_width in left_pad must be rejected.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail4.";

  // Build a non-shuffled TextFile dataset as the pipeline source
  std::string tf_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  auto dataset = TextFile({tf_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Construct Ngram with left_pad pad_width of -1; the invalid value is only
  // caught when the pipeline is validated
  auto ngram_op = std::make_shared<text::Ngram>(std::vector<int32_t>{2}, std::pair<std::string, int32_t>{"", -1});
  EXPECT_NE(ngram_op, nullptr);

  // Attach the invalid op via a Map operation
  dataset = dataset->Map({ngram_op});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator triggers pipeline validation
  // Expect failure: invalid Ngram input (the second parameter pad_width in left_pad vector less than 0)
  auto it = dataset->CreateIterator();
  EXPECT_EQ(it, nullptr);
}
2406
2407 /// Feature: Ngram op
2408 /// Description: Test Ngram where second parameter pad_width in right_pad vector is less than 0
2409 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestNgramFail5) {
  // Validate Ngram parameter checking: a negative pad_width in right_pad must be rejected.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNgramFail5.";

  // Build a non-shuffled TextFile dataset as the pipeline source
  std::string tf_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
  auto dataset = TextFile({tf_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Construct Ngram with a valid left_pad but a right_pad pad_width of -1;
  // the invalid value is only caught when the pipeline is validated
  auto ngram_op = std::make_shared<text::Ngram>(
    std::vector<int32_t>{2}, std::pair<std::string, int32_t>{"", 1}, std::pair<std::string, int32_t>{"", -1});
  EXPECT_NE(ngram_op, nullptr);

  // Attach the invalid op via a Map operation
  dataset = dataset->Map({ngram_op});
  EXPECT_NE(dataset, nullptr);

  // Creating the iterator triggers pipeline validation
  // Expect failure: invalid Ngram input (the second parameter pad_width in right_pad vector less than 0)
  auto it = dataset->CreateIterator();
  EXPECT_EQ(it, nullptr);
}
2433
2434 /// Feature: NormalizeUTF8 op
2435 /// Description: Test NormalizeUTF8 when the normalize_form is NormalizeForm::kNfkc
2436 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success) {
  // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfkc.
  // NFKC = compatibility decomposition followed by canonical composition, so
  // compatibility characters (e.g. the "fi" ligature, superscript digits) are
  // folded into their plain equivalents — visible in the expected values below.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success.";

  // Create a TextFile dataset (one sample per line, read in file order)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create normalizeutf8 operation on ds
  std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfkc);
  EXPECT_NE(normalizeutf8, nullptr);

  // Create Map operation on ds, normalizing the "text" column in place
  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // NOTE(review): the byte-level composition of these literals matters (composed
  // vs decomposed forms can render identically) — do not re-type or "clean up".
  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    // Each row carries a single scalar string; compare against the fixture value
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
    mindspore::MSTensor ms_expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All six input lines must have been consumed
  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}
2482
2483 /// Feature: NormalizeUTF8 op
2484 /// Description: Test NormalizeUTF8 when the normalize_form is NormalizeForm::kNfc
2485 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success1) {
  // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfc.
  // NFC = canonical decomposition followed by canonical composition; unlike
  // NFKC it preserves compatibility characters (ligature "fi", superscript "⁵").
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success1.";

  // Create a TextFile dataset (one sample per line, read in file order)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create normalizeutf8 operation on ds
  std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfc);
  EXPECT_NE(normalizeutf8, nullptr);

  // Create Map operation on ds, normalizing the "text" column in place
  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // NOTE(review): the byte-level composition of these literals matters (composed
  // vs decomposed forms can render identically) — do not re-type or "clean up".
  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    // Each row carries a single scalar string; compare against the fixture value
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
    mindspore::MSTensor ms_expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All six input lines must have been consumed
  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}
2531
2532 /// Feature: NormalizeUTF8 op
2533 /// Description: Test NormalizeUTF8 when the normalize_form is NormalizeForm::kNfd
2534 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success2) {
  // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfd.
  // NFD = canonical decomposition only; expected strings may differ from the
  // NFC case only in byte-level composition, not in rendered glyphs.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success2.";

  // Create a TextFile dataset (one sample per line, read in file order)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create normalizeutf8 operation on ds
  std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfd);
  EXPECT_NE(normalizeutf8, nullptr);

  // Create Map operation on ds, normalizing the "text" column in place
  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // NOTE(review): the byte-level composition of these literals matters (composed
  // vs decomposed forms can render identically) — do not re-type or "clean up".
  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "2⁵", "ẛ̣"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    // Each row carries a single scalar string; compare against the fixture value
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
    mindspore::MSTensor ms_expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All six input lines must have been consumed
  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}
2580
2581 /// Feature: NormalizeUTF8 op
2582 /// Description: Test NormalizeUTF8 when the normalize_form is NormalizeForm::kNfkd
2583 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestNormalizeUTF8Success3) {
  // Testing the parameter of NormalizeUTF8 interface when the normalize_form is NormalizeForm::kNfkd.
  // NFKD = compatibility decomposition; like NFKC it folds the "fi" ligature and
  // superscript digits, but leaves characters decomposed at the byte level.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestNormalizeUTF8Success3.";

  // Create a TextFile dataset (one sample per line, read in file order)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/normalize.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create normalizeutf8 operation on ds
  std::shared_ptr<TensorTransform> normalizeutf8 = std::make_shared<text::NormalizeUTF8>(NormalizeForm::kNfkd);
  EXPECT_NE(normalizeutf8, nullptr);

  // Create Map operation on ds, normalizing the "text" column in place
  ds = ds->Map({normalizeutf8}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // NOTE(review): the byte-level composition of these literals matters (composed
  // vs decomposed forms can render identically) — do not re-type or "clean up".
  std::vector<std::string> expected = {"ṩ", "ḍ̇", "q̣̇", "fi", "25", "ṩ"};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];
    // Each row carries a single scalar string; compare against the fixture value
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateScalar(expected[i], &de_expected_tensor));
    mindspore::MSTensor ms_expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All six input lines must have been consumed
  EXPECT_EQ(i, 6);

  // Manually terminate the pipeline
  iter->Stop();
}
2629
2630 /// Feature: RegexReplace op
2631 /// Description: Test RegexReplace when the replace_all=true
2632 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess) {
  // RegexReplace with replace_all=true: every "\s+" match in each line is replaced.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess.";

  // Source dataset: one sample per line, read in file order
  std::string txt_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  auto dataset = TextFile({txt_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Replace every whitespace run with "_" (replace_all = true)
  auto replace_op = std::make_shared<text::RegexReplace>("\\s+", "_", true);
  EXPECT_NE(replace_op, nullptr);

  // Apply the transform to the "text" column
  dataset = dataset->Map({replace_op}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree
  auto iter = dataset->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // One expected scalar string per input line, with all whitespace runs replaced
  std::vector<std::string> expected = {"Hello_World", "Let's_Go", "1:hello", "2:world",
                                       "31:beijing", "Welcome_to_China!", "_我_不想_长大_", "Welcome_to_Shenzhen!"};

  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> expected_de;
    ASSERT_OK(Tensor::CreateScalar(expected[row_count], &expected_de));
    auto expected_ms = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de));
    EXPECT_MSTENSOR_EQ(text_col, expected_ms);
    ASSERT_OK(iter->GetNextRow(&row));
    ++row_count;
  }

  // All eight input lines must have been consumed
  EXPECT_EQ(row_count, 8);

  // Shut the pipeline down explicitly
  iter->Stop();
}
2679
2680 /// Feature: RegexReplace op
2681 /// Description: Test RegexReplace when the replace_all=false
2682 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestRegexReplaceSuccess1) {
  // RegexReplace with replace_all=false: only the FIRST "\s+" match per line is replaced.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexReplaceSuccess1.";

  // Source dataset: one sample per line, read in file order
  std::string txt_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  auto dataset = TextFile({txt_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Replace only the first whitespace run with "_" (replace_all = false)
  auto replace_op = std::make_shared<text::RegexReplace>("\\s+", "_", false);
  EXPECT_NE(replace_op, nullptr);

  // Apply the transform to the "text" column
  dataset = dataset->Map({replace_op}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree
  auto iter = dataset->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Later whitespace in each line is left untouched — only the first run becomes "_"
  std::vector<std::string> expected = {"Hello_World", "Let's_Go", "1:hello", "2:world",
                                       "31:beijing", "Welcome_to China!", "_我 不想 长大 ", "Welcome_to Shenzhen!"};

  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> expected_de;
    ASSERT_OK(Tensor::CreateScalar(expected[row_count], &expected_de));
    auto expected_ms = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de));
    EXPECT_MSTENSOR_EQ(text_col, expected_ms);
    ASSERT_OK(iter->GetNextRow(&row));
    ++row_count;
  }

  // All eight input lines must have been consumed
  EXPECT_EQ(row_count, 8);

  // Shut the pipeline down explicitly
  iter->Stop();
}
2729
2730 /// Feature: RegexTokenizer op
2731 /// Description: Test RegexTokenizer when with_offsets=false
2732 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess) {
  // RegexTokenizer with with_offsets=false: split on "\s+" and keep the
  // delimiters (keep_delim pattern also "\s+"), emitting only the token column.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess.";

  // Source dataset: one sample per line, read in file order
  std::string txt_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  auto dataset = TextFile({txt_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Tokenizer: delim pattern "\s+", keep-delim pattern "\s+", no offsets
  auto tokenizer_op = std::make_shared<text::RegexTokenizer>("\\s+", "\\s+", false);
  EXPECT_NE(tokenizer_op, nullptr);

  // Apply the tokenizer to the "text" column
  dataset = dataset->Map({tokenizer_op}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree
  auto iter = dataset->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Per-line token lists; whitespace runs are retained as their own tokens
  std::vector<std::vector<std::string>> expected = {{"Hello", " ", "World"},
                                                    {"Let's", " ", "Go"},
                                                    {"1:hello"},
                                                    {"2:world"},
                                                    {"31:beijing"},
                                                    {"Welcome", " ", "to", " ", "China!"},
                                                    {" ", "我", " ", "不想", " ", "长大", " "},
                                                    {"Welcome", " ", "to", " ", "Shenzhen!"}};

  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_col = row["text"];

    // Build a 1-D expected tensor with one element per token
    std::shared_ptr<Tensor> expected_de;
    int token_count = expected[row_count].size();
    ASSERT_OK(Tensor::CreateFromVector(expected[row_count], TensorShape({token_count}), &expected_de));
    auto expected_ms = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de));
    EXPECT_MSTENSOR_EQ(text_col, expected_ms);

    ASSERT_OK(iter->GetNextRow(&row));
    ++row_count;
  }

  // All eight input lines must have been consumed
  EXPECT_EQ(row_count, 8);

  // Shut the pipeline down explicitly
  iter->Stop();
}
2788
2789 /// Feature: RegexTokenizer op
2790 /// Description: Test RegexTokenizer when with_offsets=true
2791 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestRegexTokenizerSuccess1) {
  // Testing the parameter of RegexTokenizer interface when the with_offsets is true.
  // With offsets enabled the op emits three columns: the tokens plus the byte
  // offset where each token starts and ends in the original line.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestRegexTokenizerSuccess1.";

  // Create a TextFile dataset (one sample per line, read in file order)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/regex_replace.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create regex_tokenizer operation on ds (delim "\s+", keep_delim "\s+", with_offsets=true)
  std::shared_ptr<TensorTransform> regex_tokenizer = std::make_shared<text::RegexTokenizer>("\\s+", "\\s+", true);
  EXPECT_NE(regex_tokenizer, nullptr);

  // Create Map operation on ds; "text" is replaced by the three output columns
  ds = ds->Map({regex_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Expected tokens per line; whitespace runs are retained as their own tokens
  std::vector<std::vector<std::string>> expected_tokens = {{"Hello", " ", "World"},
                                                           {"Let's", " ", "Go"},
                                                           {"1:hello"},
                                                           {"2:world"},
                                                           {"31:beijing"},
                                                           {"Welcome", " ", "to", " ", "China!"},
                                                           {" ", "我", " ", "不想", " ", "长大", " "},
                                                           {"Welcome", " ", "to", " ", "Shenzhen!"}};

  // Byte offsets (CJK characters count 3 bytes each, hence the jumps in row 7)
  std::vector<std::vector<uint32_t>> expected_offsets_start = {
    {0, 5, 6}, {0, 5, 6}, {0}, {0}, {0}, {0, 7, 8, 10, 11}, {0, 2, 5, 6, 12, 14, 20}, {0, 7, 8, 10, 11}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {
    {5, 6, 11}, {5, 6, 8}, {7}, {7}, {10}, {7, 8, 10, 11, 17}, {2, 5, 6, 12, 14, 20, 21}, {7, 8, 10, 11, 20}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto token = row["token"];
    auto start = row["offsets_start"];
    auto limit = row["offsets_limit"];

    std::shared_ptr<Tensor> de_expected_tokens;
    // x = token count; reused below as the shape of both offset tensors, since
    // every token has exactly one start and one limit offset
    int x = expected_tokens[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
    mindspore::MSTensor ms_expected_tokens =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
    EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);

    std::shared_ptr<Tensor> de_expected_offsets_start;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
    mindspore::MSTensor ms_expected_offsets_start =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
    EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);

    std::shared_ptr<Tensor> de_expected_offsets_limit;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
    mindspore::MSTensor ms_expected_offsets_limit =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
    EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All eight input lines must have been consumed
  EXPECT_EQ(i, 8);

  // Manually terminate the pipeline
  iter->Stop();
}
2866
2867 /// Feature: UnicodeCharTokenizer op
2868 /// Description: Test UnicodeCharTokenizer when with_offsets is default
2869 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess) {
  // UnicodeCharTokenizer with default parameters: each line is split into
  // individual Unicode code points, with no offsets columns emitted.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess.";

  // Source dataset: one sample per line, read in file order
  std::string txt_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  auto dataset = TextFile({txt_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Tokenizer with defaults (with_offsets = false)
  auto tokenizer_op = std::make_shared<text::UnicodeCharTokenizer>();
  EXPECT_NE(tokenizer_op, nullptr);

  // Apply the tokenizer to the "text" column
  dataset = dataset->Map({tokenizer_op}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree
  auto iter = dataset->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // One character (code point) per token, including spaces
  std::vector<std::vector<std::string>> expected = {
    {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
    {"北", "京", "欢", "迎", "您", "!"},
    {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
    {" ", " "}};

  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_col = row["text"];

    // Build a 1-D expected tensor with one element per character
    std::shared_ptr<Tensor> expected_de;
    int char_count = expected[row_count].size();
    ASSERT_OK(Tensor::CreateFromVector(expected[row_count], TensorShape({char_count}), &expected_de));
    auto expected_ms = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de));
    EXPECT_MSTENSOR_EQ(text_col, expected_ms);

    ASSERT_OK(iter->GetNextRow(&row));
    ++row_count;
  }

  // All four input lines must have been consumed
  EXPECT_EQ(row_count, 4);

  // Shut the pipeline down explicitly
  iter->Stop();
}
2922
2923 /// Feature: UnicodeCharTokenizer op
2924 /// Description: Test UnicodeCharTokenizer when with_offsets=true
2925 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestUnicodeCharTokenizerSuccess1) {
  // Testing the parameter of UnicodeCharTokenizer interface when the with_offsets is true.
  // With offsets enabled the op emits three columns: per-character tokens plus
  // the byte offset where each character starts and ends in the original line.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeCharTokenizerSuccess1.";

  // Create a TextFile dataset (one sample per line, read in file order)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create unicodechar_tokenizer operation on ds (with_offsets = true)
  std::shared_ptr<TensorTransform> unicodechar_tokenizer = std::make_shared<text::UnicodeCharTokenizer>(true);
  EXPECT_NE(unicodechar_tokenizer, nullptr);

  // Create Map operation on ds; "text" is replaced by the three output columns
  ds = ds->Map({unicodechar_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // One token per Unicode code point, spaces included
  std::vector<std::vector<std::string>> expected_tokens = {
    {"W", "e", "l", "c", "o", "m", "e", " ", "t", "o", " ", "B", "e", "i", "j", "i", "n", "g", "!"},
    {"北", "京", "欢", "迎", "您", "!"},
    {"我", "喜", "欢", "E", "n", "g", "l", "i", "s", "h", "!"},
    {" ", " "}};

  // Byte offsets: ASCII characters advance by 1, CJK characters by 3
  std::vector<std::vector<uint32_t>> expected_offsets_start = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18},
    {0, 3, 6, 9, 12, 15},
    {0, 3, 6, 9, 10, 11, 12, 13, 14, 15, 16},
    {0, 1}};

  std::vector<std::vector<uint32_t>> expected_offsets_limit = {
    {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19},
    {3, 6, 9, 12, 15, 18},
    {3, 6, 9, 10, 11, 12, 13, 14, 15, 16, 17},
    {1, 2}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto token = row["token"];
    auto start = row["offsets_start"];
    auto limit = row["offsets_limit"];

    std::shared_ptr<Tensor> de_expected_tokens;
    // x = token count; reused below as the shape of both offset tensors, since
    // every token has exactly one start and one limit offset
    int x = expected_tokens[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
    mindspore::MSTensor ms_expected_tokens =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
    EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);

    std::shared_ptr<Tensor> de_expected_offsets_start;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
    mindspore::MSTensor ms_expected_offsets_start =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
    EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);

    std::shared_ptr<Tensor> de_expected_offsets_limit;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
    mindspore::MSTensor ms_expected_offsets_limit =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
    EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All four input lines must have been consumed
  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}
3004
// English wordpiece vocabulary shared by the WordpieceTokenizer tests below;
// entries prefixed with "##" are suffix sub-word pieces.
std::vector<std::string> vocab_english = {"book", "cholera", "era", "favor", "##ite", "my",
                                          "is", "love", "dur", "##ing", "the"};

// Chinese single-character vocabulary shared by the WordpieceTokenizer tests below.
std::vector<std::string> vocab_chinese = {"我", "最", "喜", "欢", "的", "书", "是", "霍", "乱", "时", "期", "爱", "情"};
3009
3010 /// Feature: WordpieceTokenizer op
3011 /// Description: Test WordpieceTokenizer with default parameters on English vocab
3012 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess1) {
  // WordpieceTokenizer with default parameters on the shared English vocab:
  // in-vocab words pass through, longer words split into "##" suffix pieces,
  // and out-of-vocab words become the default unknown token "[UNK]".
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess1.";

  // Source dataset: one word per line, read in file order
  std::string txt_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  auto dataset = TextFile({txt_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Restrict to the first 10 samples
  dataset = dataset->Take(10);
  EXPECT_NE(dataset, nullptr);

  // Build the vocab from the shared English word list
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status rc = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(rc, Status::OK());

  // Tokenizer with default parameters (default unknown_token is "[UNK]")
  auto tokenizer_op = std::make_shared<text::WordpieceTokenizer>(vocab);
  EXPECT_NE(tokenizer_op, nullptr);

  // Apply the tokenizer to the "text" column
  dataset = dataset->Map({tokenizer_op}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Building the iterator constructs and launches the execution tree
  auto iter = dataset->CreateIterator();
  EXPECT_NE(iter, nullptr);

  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Expected wordpiece splits; the final input word is out-of-vocab
  std::vector<std::vector<std::string>> expected = {
    {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};

  uint64_t row_count = 0;
  while (!row.empty()) {
    auto text_col = row["text"];
    std::shared_ptr<Tensor> expected_de;
    ASSERT_OK(Tensor::CreateFromVector(expected[row_count], &expected_de));
    auto expected_ms = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de));
    EXPECT_MSTENSOR_EQ(text_col, expected_ms);
    ASSERT_OK(iter->GetNextRow(&row));
    ++row_count;
  }

  // All ten taken samples must have been consumed
  EXPECT_EQ(row_count, 10);

  // Shut the pipeline down explicitly
  iter->Stop();
}
3068
/// Feature: WordpieceTokenizer op
/// Description: Test WordpieceTokenizer with empty unknown_token
/// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess2.";
  // Test WordpieceTokenizer with empty unknown_token

  // Create a TextFile dataset (first 10 rows only, in file order)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Take operation on ds
  ds = ds->Take(10);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create WordpieceTokenizer operation on ds.
  // unknown_token = "": an out-of-vocabulary word is emitted unchanged rather
  // than replaced by an unknown marker (see the last expected row, {"what"},
  // vs {"[UNK]"} in TestWordpieceTokenizerSuccess1).
  std::shared_ptr<TensorTransform> wordpiece_tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "", false);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({wordpiece_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Expected token lists, one entry per input row
  std::vector<std::vector<std::string>> expected = {
    {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"what"}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto txt = row["text"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 10 taken rows must have been consumed
  EXPECT_EQ(i, 10);

  // Manually terminate the pipeline
  iter->Stop();
}
3128
3129 /// Feature: WordpieceTokenizer op
3130 /// Description: Test WordpieceTokenizer with non-default max_bytes_per_token
3131 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestWordpieceTokenizerSuccess3)3132 TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess3) {
3133 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess3.";
3134 // Test WordpieceTokenizer with non-default max_bytes_per_token
3135
3136 // Create a TextFile dataset
3137 std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
3138 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3139 EXPECT_NE(ds, nullptr);
3140
3141 // Create Take operation on ds
3142 ds = ds->Take(10);
3143 EXPECT_NE(ds, nullptr);
3144
3145 // Create a vocab from vector
3146 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
3147 Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
3148 EXPECT_EQ(s, Status::OK());
3149
3150 // Create WordpieceTokenizer operation on ds
3151 std::shared_ptr<TensorTransform> wordpiece_tokenizer =
3152 std::make_shared<text::WordpieceTokenizer>(vocab, "##", 4, "[UNK]", false);
3153 EXPECT_NE(wordpiece_tokenizer, nullptr);
3154
3155 // Create Map operation on ds
3156 ds = ds->Map({wordpiece_tokenizer}, {"text"});
3157 EXPECT_NE(ds, nullptr);
3158
3159 // Create an iterator over the result of the above dataset
3160 // This will trigger the creation of the Execution Tree and launch it.
3161 std::shared_ptr<Iterator> iter = ds->CreateIterator();
3162 EXPECT_NE(iter, nullptr);
3163
3164 // Iterate the dataset and get each row
3165 std::unordered_map<std::string, mindspore::MSTensor> row;
3166 ASSERT_OK(iter->GetNextRow(&row));
3167
3168 std::vector<std::vector<std::string>> expected = {{"my"}, {"[UNK]"}, {"book"}, {"is"}, {"love"},
3169 {"[UNK]"}, {"the"}, {"[UNK]"}, {"era"}, {"[UNK]"}};
3170
3171 uint64_t i = 0;
3172 while (row.size() != 0) {
3173 auto txt = row["text"];
3174 std::shared_ptr<Tensor> de_expected_tensor;
3175 ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
3176 mindspore::MSTensor expected_tensor =
3177 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
3178 EXPECT_MSTENSOR_EQ(txt, expected_tensor);
3179 ASSERT_OK(iter->GetNextRow(&row));
3180 i++;
3181 }
3182
3183 EXPECT_EQ(i, 10);
3184
3185 // Manually terminate the pipeline
3186 iter->Stop();
3187 }
3188
/// Feature: WordpieceTokenizer op
/// Description: Test WordpieceTokenizer with default parameters on Chinese vocab
/// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess4) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess4.";
  // Test WordpieceTokenizer with default parameters on Chinese vocab

  // Create a TextFile dataset (same data file as the English tests)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Skip operation on ds: rows 0-9 are the English rows used by the
  // other WordpieceTokenizer tests; the Chinese rows start at row 10
  ds = ds->Skip(10);
  EXPECT_NE(ds, nullptr);

  // Create Take operation on ds (15 Chinese rows)
  ds = ds->Take(15);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector (per-character Chinese vocab)
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_chinese, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create WordpieceTokenizer operation on ds (explicitly passing the default
  // parameter values: suffix_indicator "##", max_bytes_per_token 100,
  // unknown_token "[UNK]", with_offsets false)
  std::shared_ptr<TensorTransform> wordpiece_tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", false);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({wordpiece_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // One expected single-character token per row; the final row is out-of-vocab
  std::vector<std::vector<std::string>> expected = {{"我"}, {"最"}, {"喜"}, {"欢"}, {"的"}, {"书"}, {"是"}, {"霍"},
                                                    {"乱"}, {"时"}, {"期"}, {"的"}, {"爱"}, {"情"}, {"[UNK]"}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto txt = row["text"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 15 taken rows must have been consumed
  EXPECT_EQ(i, 15);

  // Manually terminate the pipeline
  iter->Stop();
}
3252
/// Feature: WordpieceTokenizer op
/// Description: Test WordpieceTokenizer with with_offsets=true
/// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess5) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess5.";
  // Test WordpieceTokenizer with with_offsets true

  // Create a TextFile dataset (first 10 rows only, in file order)
  std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create Take operation on ds
  ds = ds->Take(10);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create WordpieceTokenizer operation on ds with with_offsets = true,
  // which makes the op emit three columns instead of one
  std::shared_ptr<TensorTransform> wordpiece_tokenizer =
    std::make_shared<text::WordpieceTokenizer>(vocab, "##", 100, "[UNK]", true);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create Map operation on ds, renaming the three outputs to
  // "token" / "offsets_start" / "offsets_limit"
  ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Expected tokens, plus the byte offsets of each token within its input row
  // (e.g. "favor" spans bytes [0,5) and "##ite" spans bytes [5,8) of "favorite")
  std::vector<std::vector<std::string>> expected = {
    {"my"}, {"favor", "##ite"}, {"book"}, {"is"}, {"love"}, {"dur", "##ing"}, {"the"}, {"cholera"}, {"era"}, {"[UNK]"}};
  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0}, {0, 5}, {0}, {0}, {0}, {0, 3}, {0}, {0}, {0}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{2}, {5, 8}, {4}, {2}, {4}, {3, 6}, {3}, {7}, {3}, {4}};

  uint64_t i = 0;
  while (row.size() != 0) {
    // Check the token column
    auto txt = row["token"];
    std::shared_ptr<Tensor> de_expected_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(txt, expected_tensor);

    // Check the start-offset column
    auto start = row["offsets_start"];
    std::shared_ptr<Tensor> de_expected_start_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], &de_expected_start_tensor));
    mindspore::MSTensor expected_start_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_start_tensor));
    EXPECT_MSTENSOR_EQ(start, expected_start_tensor);

    // Check the limit-offset column
    auto limit = row["offsets_limit"];
    std::shared_ptr<Tensor> de_expected_limit_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], &de_expected_limit_tensor));
    mindspore::MSTensor expected_limit_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_limit_tensor));
    EXPECT_MSTENSOR_EQ(limit, expected_limit_tensor);
    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 10 taken rows must have been consumed
  EXPECT_EQ(i, 10);

  // Manually terminate the pipeline
  iter->Stop();
}
3328
3329 /// Feature: WordpieceTokenizer op
3330 /// Description: Test WordpieceTokenizer with max_bytes_per_token=0
3331 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestWordpieceTokenizerSuccess6)3332 TEST_F(MindDataTestPipeline, TestWordpieceTokenizerSuccess6) {
3333 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerSuccess6.";
3334 // Test WordpieceTokenizer with max_bytes_per_token equals to 0
3335
3336 // Create a TextFile dataset
3337 std::string data_file = datasets_root_path_ + "/testTokenizerData/wordpiece_tokenizer.txt";
3338 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3339 EXPECT_NE(ds, nullptr);
3340
3341 // Create Take operation on ds
3342 ds = ds->Take(10);
3343 EXPECT_NE(ds, nullptr);
3344
3345 // Create a vocab from vector
3346 std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
3347 Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
3348 EXPECT_EQ(s, Status::OK());
3349
3350 // Create WordpieceTokenizer operation on ds
3351 std::shared_ptr<TensorTransform> wordpiece_tokenizer =
3352 std::make_shared<text::WordpieceTokenizer>(vocab, "##", 0, "[UNK]", true);
3353 EXPECT_NE(wordpiece_tokenizer, nullptr);
3354
3355 // Create Map operation on ds
3356 ds = ds->Map({wordpiece_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
3357 EXPECT_NE(ds, nullptr);
3358
3359 // Create an iterator over the result of the above dataset
3360 // This will trigger the creation of the Execution Tree and launch it.
3361 std::shared_ptr<Iterator> iter = ds->CreateIterator();
3362 EXPECT_NE(iter, nullptr);
3363
3364 // Iterate the dataset and get each row
3365 std::unordered_map<std::string, mindspore::MSTensor> row;
3366 ASSERT_OK(iter->GetNextRow(&row));
3367
3368 std::vector<std::vector<std::string>> expected = {{"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"},
3369 {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}, {"[UNK]"}};
3370
3371 uint64_t i = 0;
3372 while (row.size() != 0) {
3373 auto txt = row["token"];
3374 std::shared_ptr<Tensor> de_expected_tensor;
3375 ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
3376 mindspore::MSTensor expected_tensor =
3377 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
3378 EXPECT_MSTENSOR_EQ(txt, expected_tensor);
3379 ASSERT_OK(iter->GetNextRow(&row));
3380 i++;
3381 }
3382
3383 EXPECT_EQ(i, 10);
3384
3385 // Manually terminate the pipeline
3386 iter->Stop();
3387 }
3388
3389 /// Feature: WordpieceTokenizer op
3390 /// Description: Test WordpieceTokenizer with nullptr vocab
3391 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestWordpieceTokenizerFail1)3392 TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail1) {
3393 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail1.";
3394 // Test WordpieceTokenizer with nullptr vocab
3395
3396 // Create a TextFile dataset
3397 std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
3398 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3399 EXPECT_NE(ds, nullptr);
3400
3401 // Create WordpieceTokenizer operation on ds
3402 std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(nullptr);
3403 EXPECT_NE(wordpiece_tokenizer, nullptr);
3404
3405 // Create a Map operation on ds
3406 ds = ds->Map({wordpiece_tokenizer});
3407 EXPECT_NE(ds, nullptr);
3408
3409 std::shared_ptr<Iterator> iter = ds->CreateIterator();
3410 // Expect failure: invalid WordpieceTokenizer input with nullptr vocab
3411 EXPECT_EQ(iter, nullptr);
3412 }
3413
/// Feature: WordpieceTokenizer op
/// Description: Test WordpieceTokenizer with negative max_bytes_per_token
/// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline, TestWordpieceTokenizerFail2) {
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWordpieceTokenizerFail2.";
  // Test WordpieceTokenizer with negative max_bytes_per_token

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/bert_tokenizer.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create a vocab from vector
  std::shared_ptr<Vocab> vocab = std::make_shared<Vocab>();
  Status s = Vocab::BuildFromVector(vocab_english, {}, true, &vocab);
  EXPECT_EQ(s, Status::OK());

  // Create WordpieceTokenizer operation on ds.
  // max_bytes_per_token = -1 is invalid; construction still succeeds and the
  // error only surfaces when the pipeline is built below.
  std::shared_ptr<TensorTransform> wordpiece_tokenizer = std::make_shared<text::WordpieceTokenizer>(vocab, "##", -1);
  EXPECT_NE(wordpiece_tokenizer, nullptr);

  // Create a Map operation on ds
  ds = ds->Map({wordpiece_tokenizer});
  EXPECT_NE(ds, nullptr);

  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  // Expect failure: invalid WordpieceTokenizer input with negative max_bytes_per_token
  EXPECT_EQ(iter, nullptr);
}
3443
3444 /// Feature: UnicodeScriptTokenizer op
3445 /// Description: Test UnicodeScriptTokenizer when with_offsets and keep_whitespace is default
3446 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestUnicodeScriptTokenizerSuccess)3447 TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess) {
3448 // Testing the parameter of UnicodeScriptTokenizer interface when the with_offsets and the keep_whitespace is default.
3449 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess.";
3450
3451 // Create a TextFile dataset
3452 std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
3453 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3454 EXPECT_NE(ds, nullptr);
3455
3456 // Create unicodescript_tokenizer operation on ds
3457 std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>();
3458 EXPECT_NE(unicodescript_tokenizer, nullptr);
3459
3460 // Create Map operation on ds
3461 ds = ds->Map({unicodescript_tokenizer}, {"text"});
3462 EXPECT_NE(ds, nullptr);
3463
3464 // Create an iterator over the result of the above dataset
3465 // This will trigger the creation of the Execution Tree and launch it.
3466 std::shared_ptr<Iterator> iter = ds->CreateIterator();
3467 EXPECT_NE(iter, nullptr);
3468
3469 // Iterate the dataset and get each row
3470 std::unordered_map<std::string, mindspore::MSTensor> row;
3471 ASSERT_OK(iter->GetNextRow(&row));
3472
3473 std::vector<std::vector<std::string>> expected = {
3474 {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};
3475
3476 uint64_t i = 0;
3477 while (row.size() != 0) {
3478 auto ind = row["text"];
3479
3480 std::shared_ptr<Tensor> de_expected_tensor;
3481 int x = expected[i].size();
3482 ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
3483 mindspore::MSTensor expected_tensor =
3484 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
3485 EXPECT_MSTENSOR_EQ(ind, expected_tensor);
3486
3487 ASSERT_OK(iter->GetNextRow(&row));
3488 i++;
3489 }
3490
3491 EXPECT_EQ(i, 4);
3492
3493 // Manually terminate the pipeline
3494 iter->Stop();
3495 }
3496
/// Feature: UnicodeScriptTokenizer op
/// Description: Test UnicodeScriptTokenizer when with_offsets=false and keep_whitespace=true
/// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess1) {
  // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is true and the with_offsets is
  // false.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess1.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create unicodescript_tokenizer operation on ds
  // keep_whitespace = true: whitespace runs are emitted as their own tokens
  // (compare with TestUnicodeScriptTokenizerSuccess, where they are dropped)
  std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>(true);
  EXPECT_NE(unicodescript_tokenizer, nullptr);

  // Create Map operation on ds
  ds = ds->Map({unicodescript_tokenizer}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  // Note the standalone " " tokens and the whitespace-only final row
  std::vector<std::vector<std::string>> expected = {
    {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {" "}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto ind = row["text"];

    std::shared_ptr<Tensor> de_expected_tensor;
    // Shape is the number of tokens expected in this row
    int x = expected[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
    mindspore::MSTensor expected_tensor =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
    EXPECT_MSTENSOR_EQ(ind, expected_tensor);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 4 rows of the data file must have been consumed
  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}
3550
/// Feature: UnicodeScriptTokenizer op
/// Description: Test UnicodeScriptTokenizer when with_offsets=true and keep_whitespace=false
/// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess2) {
  // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is false and the with_offsets is
  // true.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess2.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create unicodescript_tokenizer operation on ds
  // with_offsets = true makes the op emit three columns instead of one
  std::shared_ptr<TensorTransform> unicodescript_tokenizer =
    std::make_shared<text::UnicodeScriptTokenizer>(false, true);
  EXPECT_NE(unicodescript_tokenizer, nullptr);

  // Create Map operation on ds, renaming the three outputs to
  // "token" / "offsets_start" / "offsets_limit"
  ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected_tokens = {
    {"Welcome", "to", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {""}};

  // Byte offsets of each token within its row (multi-byte UTF-8 characters
  // account for the large steps, e.g. "北京欢迎您" spans bytes [0,15))
  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 18, 19}, {15, 18}, {9, 16, 17}, {0}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto token = row["token"];
    auto start = row["offsets_start"];
    auto limit = row["offsets_limit"];

    // Check the token column; x (token count) is reused below as the shape of
    // the offset tensors, since there is one start/limit pair per token
    std::shared_ptr<Tensor> de_expected_tokens;
    int x = expected_tokens[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
    mindspore::MSTensor ms_expected_tokens =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
    EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);

    // Check the start-offset column
    std::shared_ptr<Tensor> de_expected_offsets_start;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
    mindspore::MSTensor ms_expected_offsets_start =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
    EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);

    // Check the limit-offset column
    std::shared_ptr<Tensor> de_expected_offsets_limit;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
    mindspore::MSTensor ms_expected_offsets_limit =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
    EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 4 rows of the data file must have been consumed
  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}
3622
/// Feature: UnicodeScriptTokenizer op
/// Description: Test UnicodeScriptTokenizer when with_offsets=true and keep_whitespace=true
/// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestUnicodeScriptTokenizerSuccess3) {
  // Testing the parameter of UnicodeScriptTokenizer interface when the keep_whitespace is true and the with_offsets is
  // true.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestUnicodeScriptTokenizerSuccess3.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create unicodescript_tokenizer operation on ds
  // keep_whitespace = true: whitespace runs become tokens; with_offsets = true:
  // the op emits three columns instead of one
  std::shared_ptr<TensorTransform> unicodescript_tokenizer = std::make_shared<text::UnicodeScriptTokenizer>(true, true);
  EXPECT_NE(unicodescript_tokenizer, nullptr);

  // Create Map operation on ds, renaming the three outputs to
  // "token" / "offsets_start" / "offsets_limit"
  ds = ds->Map({unicodescript_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected_tokens = {
    {"Welcome", " ", "to", " ", "Beijing", "!"}, {"北京欢迎您", "!"}, {"我喜欢", "English", "!"}, {" "}};

  // Byte offsets of each token; unlike the keep_whitespace=false case, the
  // whitespace tokens appear here with their own [start, limit) ranges
  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 7, 8, 10, 11, 18}, {0, 15}, {0, 9, 16}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 8, 10, 11, 18, 19}, {15, 18}, {9, 16, 17}, {2}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto token = row["token"];
    auto start = row["offsets_start"];
    auto limit = row["offsets_limit"];

    // Check the token column; x (token count) is reused below as the shape of
    // the offset tensors, since there is one start/limit pair per token
    std::shared_ptr<Tensor> de_expected_tokens;
    int x = expected_tokens[i].size();
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({x}), &de_expected_tokens));
    mindspore::MSTensor ms_expected_tokens =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
    EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);

    // Check the start-offset column
    std::shared_ptr<Tensor> de_expected_offsets_start;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({x}), &de_expected_offsets_start));
    mindspore::MSTensor ms_expected_offsets_start =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
    EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);

    // Check the limit-offset column
    std::shared_ptr<Tensor> de_expected_offsets_limit;
    ASSERT_OK(Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({x}), &de_expected_offsets_limit));
    mindspore::MSTensor ms_expected_offsets_limit =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
    EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  // All 4 rows of the data file must have been consumed
  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}
3693
3694 /// Feature: WhitespaceTokenizer op
3695 /// Description: Test WhitespaceTokenizer when with_offsets is default
3696 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestWhitespaceTokenizerSuccess)3697 TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess) {
3698 // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is default.
3699 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess.";
3700
3701 // Create a TextFile dataset
3702 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
3703 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
3704 EXPECT_NE(ds, nullptr);
3705
3706 // Create white_tokenizer operation on ds
3707 std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
3708 EXPECT_NE(white_tokenizer, nullptr);
3709
3710 // Create Map operation on ds
3711 ds = ds->Map({white_tokenizer}, {"text"});
3712 EXPECT_NE(ds, nullptr);
3713
3714 // Create an iterator over the result of the above dataset
3715 // This will trigger the creation of the Execution Tree and launch it.
3716 std::shared_ptr<Iterator> iter = ds->CreateIterator();
3717 EXPECT_NE(iter, nullptr);
3718
3719 // Iterate the dataset and get each row
3720 std::unordered_map<std::string, mindspore::MSTensor> row;
3721 ASSERT_OK(iter->GetNextRow(&row));
3722
3723 std::vector<std::vector<std::string>> expected = {
3724 {"This", "is", "a", "text", "file."}, {"Be", "happy", "every", "day."}, {"Good", "luck", "to", "everyone."}};
3725
3726 uint64_t i = 0;
3727 while (row.size() != 0) {
3728 auto ind = row["text"];
3729
3730 std::shared_ptr<Tensor> de_expected_tensor;
3731 int x = expected[i].size();
3732 ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({x}), &de_expected_tensor));
3733 mindspore::MSTensor expected_tensor =
3734 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
3735 EXPECT_MSTENSOR_EQ(ind, expected_tensor);
3736
3737 ASSERT_OK(iter->GetNextRow(&row));
3738 i++;
3739 }
3740
3741 EXPECT_EQ(i, 3);
3742
3743 // Manually terminate the pipeline
3744 iter->Stop();
3745 }
3746
3747 /// Feature: WhitespaceTokenizer op
3748 /// Description: Test WhitespaceTokenizer when with_offsets=true
3749 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline, TestWhitespaceTokenizerSuccess1) {
  // Testing the parameter of WhitespaceTokenizer interface when the with_offsets is true.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestWhitespaceTokenizerSuccess1.";

  // Create a TextFile dataset (one line of text per row, in file order).
  std::string data_file = datasets_root_path_ + "/testTokenizerData/1.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Create white_tokenizer operation on ds. with_offsets=true makes the op
  // emit the token offsets alongside the tokens themselves.
  std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>(true);
  EXPECT_NE(white_tokenizer, nullptr);

  // Create Map operation on ds: the single "text" input column fans out to
  // three output columns (tokens plus start/limit offsets).
  ds = ds->Map({white_tokenizer}, {"text"}, {"token", "offsets_start", "offsets_limit"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  // This will trigger the creation of the Execution Tree and launch it.
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  std::vector<std::vector<std::string>> expected_tokens = {
    {"Welcome", "to", "Beijing!"}, {"北京欢迎您!"}, {"我喜欢English!"}, {""}};

  // Offsets are byte positions into the original UTF-8 line (e.g. "Welcome"
  // spans bytes [0, 7), "Beijing!" spans [11, 19)).
  std::vector<std::vector<uint32_t>> expected_offsets_start = {{0, 8, 11}, {0}, {0}, {0}};
  std::vector<std::vector<uint32_t>> expected_offsets_limit = {{7, 10, 19}, {18}, {17}, {0}};

  uint64_t i = 0;
  while (row.size() != 0) {
    auto token = row["token"];
    auto start = row["offsets_start"];
    auto limit = row["offsets_limit"];

    // Number of tokens in this row; the two offsets tensors always have the
    // same length, so one count sizes all three expected tensors. Use dsize_t
    // directly to avoid the size_t -> int narrowing conversion the previous
    // `int x = ...` incurred before widening back to dsize_t in TensorShape.
    dsize_t token_count = static_cast<dsize_t>(expected_tokens[i].size());

    std::shared_ptr<Tensor> de_expected_tokens;
    ASSERT_OK(Tensor::CreateFromVector(expected_tokens[i], TensorShape({token_count}), &de_expected_tokens));
    mindspore::MSTensor ms_expected_tokens =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tokens));
    EXPECT_MSTENSOR_EQ(token, ms_expected_tokens);

    std::shared_ptr<Tensor> de_expected_offsets_start;
    ASSERT_OK(
      Tensor::CreateFromVector(expected_offsets_start[i], TensorShape({token_count}), &de_expected_offsets_start));
    mindspore::MSTensor ms_expected_offsets_start =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_start));
    EXPECT_MSTENSOR_EQ(start, ms_expected_offsets_start);

    std::shared_ptr<Tensor> de_expected_offsets_limit;
    ASSERT_OK(
      Tensor::CreateFromVector(expected_offsets_limit[i], TensorShape({token_count}), &de_expected_offsets_limit));
    mindspore::MSTensor ms_expected_offsets_limit =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_offsets_limit));
    EXPECT_MSTENSOR_EQ(limit, ms_expected_offsets_limit);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 4);

  // Manually terminate the pipeline
  iter->Stop();
}
3816
3817 /// Feature: Vectors
3818 /// Description: Test with default parameter in function BuildFromFile and function Lookup
3819 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestVectorsDefaultParam) {
  // Test with default parameter.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDefaultParam.";

  // Source dataset: one token per row, kept in file order.
  std::string words_path = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Build the Vectors table from the embedding file with default arguments.
  std::string vectors_path = datasets_root_path_ + "/testVectors/vectors.txt";
  std::shared_ptr<Vectors> vectors;
  Status status = Vectors::BuildFromFile(&vectors, vectors_path);
  EXPECT_EQ(status, Status::OK());

  // Map every token onto its embedding vector.
  std::shared_ptr<TensorTransform> to_vectors = std::make_shared<text::ToVectors>(vectors);
  EXPECT_NE(to_vectors, nullptr);
  dataset = dataset->Map({to_vectors}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launch the pipeline.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected embedding for each input row.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {0, 0, 0, 0, 0, 0},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {0, 0, 0, 0, 0, 0}};
  const dsize_t kDim = 6;

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));
  uint64_t count = 0;
  while (!next_row.empty()) {
    auto actual = next_row["text"];
    MS_LOG(INFO) << actual.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", actual);
    TensorPtr expected_de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[count], TensorShape({kDim}), &expected_de_tensor));
    auto expected_ms_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(actual, expected_ms_tensor);
    ASSERT_OK(iterator->GetNextRow(&next_row));
    ++count;
  }

  EXPECT_EQ(count, 7);

  // Shut down the pipeline explicitly.
  iterator->Stop();
}
3877
3878 /// Feature: Vectors
3879 /// Description: Test with all parameters which include `path` and `max_vector` in function BuildFromFile
3880 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestVectorsAllBuildfromfileParams) {
  // Test with two parameters.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllBuildfromfileParams.";

  // Source dataset: one token per row, kept in file order.
  std::string words_path = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Build the Vectors table, capping the number of loaded vectors at 100.
  std::string vectors_path = datasets_root_path_ + "/testVectors/vectors.txt";
  std::shared_ptr<Vectors> vectors;
  Status status = Vectors::BuildFromFile(&vectors, vectors_path, 100);
  EXPECT_EQ(status, Status::OK());

  // Map every token onto its embedding vector.
  std::shared_ptr<TensorTransform> to_vectors = std::make_shared<text::ToVectors>(vectors);
  EXPECT_NE(to_vectors, nullptr);
  dataset = dataset->Map({to_vectors}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launch the pipeline.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected embedding for each input row.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {0, 0, 0, 0, 0, 0},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {0, 0, 0, 0, 0, 0}};
  const dsize_t kDim = 6;

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));
  uint64_t count = 0;
  while (!next_row.empty()) {
    auto actual = next_row["text"];
    MS_LOG(INFO) << actual.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", actual);
    TensorPtr expected_de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[count], TensorShape({kDim}), &expected_de_tensor));
    auto expected_ms_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(actual, expected_ms_tensor);
    ASSERT_OK(iterator->GetNextRow(&next_row));
    ++count;
  }

  EXPECT_EQ(count, 7);

  // Shut down the pipeline explicitly.
  iterator->Stop();
}
3938
3939 /// Feature: Vectors
3940 /// Description: Test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
3941 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestVectorsUnknownInit) {
  // Test with two parameters.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsUnknownInit.";

  // Source dataset: one token per row, kept in file order.
  std::string words_path = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Build the Vectors table, capping the number of loaded vectors at 100.
  std::string vectors_path = datasets_root_path_ + "/testVectors/vectors.txt";
  std::shared_ptr<Vectors> vectors;
  Status status = Vectors::BuildFromFile(&vectors, vectors_path, 100);
  EXPECT_EQ(status, Status::OK());

  // Tokens missing from the table are mapped to this initializer.
  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  std::shared_ptr<TensorTransform> to_vectors = std::make_shared<text::ToVectors>(vectors, unknown_init);
  EXPECT_NE(to_vectors, nullptr);

  // Map every token onto its embedding vector.
  dataset = dataset->Map({to_vectors}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launch the pipeline.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected embedding for each input row; the -1 rows use unknown_init.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {-1, -1, -1, -1, -1, -1},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {-1, -1, -1, -1, -1, -1}};
  const dsize_t kDim = 6;

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));
  uint64_t count = 0;
  while (!next_row.empty()) {
    auto actual = next_row["text"];
    MS_LOG(INFO) << actual.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", actual);
    TensorPtr expected_de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[count], TensorShape({kDim}), &expected_de_tensor));
    auto expected_ms_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(actual, expected_ms_tensor);
    ASSERT_OK(iterator->GetNextRow(&next_row));
    ++count;
  }

  EXPECT_EQ(count, 7);

  // Shut down the pipeline explicitly.
  iterator->Stop();
}
4000
4001 /// Feature: Vectors
4002 /// Description: Test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
4003 /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
4004 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestVectorsAllParams) {
  // Test with all parameters.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsAllParams.";
  // Source dataset: one token per row, kept in file order.
  std::string words_path = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Build the Vectors table from the embedding file with default arguments.
  std::string vectors_path = datasets_root_path_ + "/testVectors/vectors.txt";
  std::shared_ptr<Vectors> vectors;
  Status status = Vectors::BuildFromFile(&vectors, vectors_path);
  EXPECT_EQ(status, Status::OK());

  // unknown_init covers missing tokens; the third argument enables the
  // lower-case fallback lookup for tokens containing capital letters.
  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  std::shared_ptr<TensorTransform> to_vectors = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
  EXPECT_NE(to_vectors, nullptr);

  // Map every token onto its embedding vector.
  dataset = dataset->Map({to_vectors}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launch the pipeline.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected embedding for each input row; the -1 rows use unknown_init.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {-1, -1, -1, -1, -1, -1},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {-1, -1, -1, -1, -1, -1}};
  const dsize_t kDim = 6;

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));
  uint64_t count = 0;
  while (!next_row.empty()) {
    auto actual = next_row["text"];
    MS_LOG(INFO) << actual.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", actual);
    TensorPtr expected_de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[count], TensorShape({kDim}), &expected_de_tensor));
    auto expected_ms_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(actual, expected_ms_tensor);
    ASSERT_OK(iterator->GetNextRow(&next_row));
    ++count;
  }

  EXPECT_EQ(count, 7);

  // Shut down the pipeline explicitly.
  iterator->Stop();
}
4062
4063 /// Feature: Vectors
4064 /// Description: Test with pre-vectors set that have the different dimension
4065 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsDifferentDimension) {
  // Tokens don't have the same number of vectors.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsDifferentDimension.";

  // The dataset mirrors the positive cases; the failure under test comes
  // from BuildFromFile below, not from the pipeline.
  std::string words_path = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Loading a file whose rows have inconsistent dimensions must fail.
  std::string vectors_path = datasets_root_path_ + "/testVectors/vectors_dim_different.txt";
  std::shared_ptr<Vectors> vectors;
  Status status = Vectors::BuildFromFile(&vectors, vectors_path, 100);
  EXPECT_NE(status, Status::OK());
}
4080
4081 /// Feature: Vectors
4082 /// Description: Test with pre-vectors set that has the head-info
4083 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestVectorsWithHeadInfo) {
  // Test with words that has head info.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithHeadInfo.";
  // Source dataset: one token per row, kept in file order.
  std::string words_path = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Build the Vectors table from a file that carries a header line.
  std::string vectors_path = datasets_root_path_ + "/testVectors/vectors_with_info.txt";
  std::shared_ptr<Vectors> vectors;
  Status status = Vectors::BuildFromFile(&vectors, vectors_path);
  EXPECT_EQ(status, Status::OK());

  // unknown_init covers missing tokens; lower-case fallback lookup enabled.
  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  std::shared_ptr<TensorTransform> to_vectors = std::make_shared<text::ToVectors>(vectors, unknown_init, true);
  EXPECT_NE(to_vectors, nullptr);

  // Map every token onto its embedding vector.
  dataset = dataset->Map({to_vectors}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launch the pipeline.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected embedding for each input row; the -1 rows use unknown_init.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {-1, -1, -1, -1, -1, -1},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {-1, -1, -1, -1, -1, -1}};
  const dsize_t kDim = 6;

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));
  uint64_t count = 0;
  while (!next_row.empty()) {
    auto actual = next_row["text"];
    MS_LOG(INFO) << actual.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", actual);
    TensorPtr expected_de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[count], TensorShape({kDim}), &expected_de_tensor));
    auto expected_ms_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(actual, expected_ms_tensor);
    ASSERT_OK(iterator->GetNextRow(&next_row));
    ++count;
  }

  EXPECT_EQ(count, 7);

  // Shut down the pipeline explicitly.
  iterator->Stop();
}
4141
4142 /// Feature: Vectors
4143 /// Description: Test with the parameter max_vectors that is <= 0
4144 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsMaxVectorsLessThanZero) {
  // Test with max_vectors <= 0.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsMaxVectorsLessThanZero.";

  // The dataset mirrors the positive cases; the failure under test comes
  // from BuildFromFile below, not from the pipeline.
  std::string words_path = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // A negative max_vectors must be rejected.
  std::string vectors_path = datasets_root_path_ + "/testVectors/vectors.txt";
  std::shared_ptr<Vectors> vectors;
  Status status = Vectors::BuildFromFile(&vectors, vectors_path, -1);
  EXPECT_NE(status, Status::OK());
}
4159
4160 /// Feature: Vectors
4161 /// Description: Test with the pre-vectors file that is empty
4162 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsWithEmptyFile) {
  // Read empty file.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithEmptyFile.";

  // The dataset mirrors the positive cases; the failure under test comes
  // from BuildFromFile below, not from the pipeline.
  std::string words_path = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Loading an empty embedding file must fail.
  std::string vectors_path = datasets_root_path_ + "/testVectors/vectors_empty.txt";
  std::shared_ptr<Vectors> vectors;
  Status status = Vectors::BuildFromFile(&vectors, vectors_path);
  EXPECT_NE(status, Status::OK());
}
4177
4178 /// Feature: Vectors
4179 /// Description: Test with the pre-vectors file that is not exist
4180 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsWithNotExistFile) {
  // Test with not exist file.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithNotExistFile.";

  // The dataset mirrors the positive cases; the failure under test comes
  // from BuildFromFile below, not from the pipeline.
  std::string words_path = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Loading a non-existent embedding file must fail.
  std::string vectors_path = datasets_root_path_ + "/testVectors/no_vectors.txt";
  std::shared_ptr<Vectors> vectors;
  Status status = Vectors::BuildFromFile(&vectors, vectors_path);
  EXPECT_NE(status, Status::OK());
}
4195
4196 /// Feature: Vectors
4197 /// Description: Test with the pre-vectors set that has a situation that info-head is not the first line in the set
4198 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestVectorsWithWrongInfoFile) {
  // Wrong info.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestVectorsWithWrongInfoFile.";

  // The dataset mirrors the positive cases; the failure under test comes
  // from BuildFromFile below, not from the pipeline.
  std::string words_path = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Loading a file whose header info is misplaced must fail.
  std::string vectors_path = datasets_root_path_ + "/testVectors/vectors_with_wrong_info.txt";
  std::shared_ptr<Vectors> vectors;
  Status status = Vectors::BuildFromFile(&vectors, vectors_path);
  EXPECT_NE(status, Status::OK());
}
4213
4214 /// Feature: FastText
4215 /// Description: Test with default parameter in function BuildFromFile and function Lookup
4216 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextDefaultParam) {
  // Test with default parameter.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDefaultParam.";

  // Source dataset: one token per row, kept in file order.
  std::string words_path = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Build the FastText table from the .vec file with default arguments.
  std::string vectors_path = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  Status status = FastText::BuildFromFile(&fast_text, vectors_path);
  EXPECT_EQ(status, Status::OK());

  // Map every token onto its embedding vector.
  std::shared_ptr<TensorTransform> to_vectors = std::make_shared<text::ToVectors>(fast_text);
  EXPECT_NE(to_vectors, nullptr);
  dataset = dataset->Map({to_vectors}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launch the pipeline.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected embedding for each input row.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {0, 0, 0, 0, 0, 0},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {0, 0, 0, 0, 0, 0}};
  const dsize_t kDim = 6;

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));
  uint64_t count = 0;
  while (!next_row.empty()) {
    auto actual = next_row["text"];
    MS_LOG(INFO) << actual.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", actual);
    TensorPtr expected_de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[count], TensorShape({kDim}), &expected_de_tensor));
    auto expected_ms_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(actual, expected_ms_tensor);
    ASSERT_OK(iterator->GetNextRow(&next_row));
    ++count;
  }

  EXPECT_EQ(count, 7);

  // Shut down the pipeline explicitly.
  iterator->Stop();
}
4274
4275 /// Feature: FastText
4276 /// Description: Test with all parameters which include `path` and `max_vector` in function BuildFromFile
4277 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextAllBuildfromfileParams) {
  // Test with two parameters.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllBuildfromfileParams.";

  // Source dataset: one token per row, kept in file order.
  std::string words_path = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Build the FastText table, capping the number of loaded vectors at 100.
  std::string vectors_path = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  Status status = FastText::BuildFromFile(&fast_text, vectors_path, 100);
  EXPECT_EQ(status, Status::OK());

  // Map every token onto its embedding vector.
  std::shared_ptr<TensorTransform> to_vectors = std::make_shared<text::ToVectors>(fast_text);
  EXPECT_NE(to_vectors, nullptr);
  dataset = dataset->Map({to_vectors}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launch the pipeline.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected embedding for each input row.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {0, 0, 0, 0, 0, 0},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {0, 0, 0, 0, 0, 0}};
  const dsize_t kDim = 6;

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));
  uint64_t count = 0;
  while (!next_row.empty()) {
    auto actual = next_row["text"];
    MS_LOG(INFO) << actual.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", actual);
    TensorPtr expected_de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[count], TensorShape({kDim}), &expected_de_tensor));
    auto expected_ms_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(actual, expected_ms_tensor);
    ASSERT_OK(iterator->GetNextRow(&next_row));
    ++count;
  }

  EXPECT_EQ(count, 7);

  // Shut down the pipeline explicitly.
  iterator->Stop();
}
4335
4336 /// Feature: FastText
4337 /// Description: Test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
4338 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextUnknownInit) {
  // Test with two parameters.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextUnknownInit.";

  // Source dataset: one token per row, kept in file order.
  std::string words_path = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Build the FastText table, capping the number of loaded vectors at 100.
  std::string vectors_path = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  Status status = FastText::BuildFromFile(&fast_text, vectors_path, 100);
  EXPECT_EQ(status, Status::OK());

  // Tokens missing from the table are mapped to this initializer.
  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  std::shared_ptr<TensorTransform> to_vectors = std::make_shared<text::ToVectors>(fast_text, unknown_init);
  EXPECT_NE(to_vectors, nullptr);

  // Map every token onto its embedding vector.
  dataset = dataset->Map({to_vectors}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launch the pipeline.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected embedding for each input row; the -1 rows use unknown_init.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {-1, -1, -1, -1, -1, -1},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {-1, -1, -1, -1, -1, -1}};
  const dsize_t kDim = 6;

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));
  uint64_t count = 0;
  while (!next_row.empty()) {
    auto actual = next_row["text"];
    MS_LOG(INFO) << actual.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", actual);
    TensorPtr expected_de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[count], TensorShape({kDim}), &expected_de_tensor));
    auto expected_ms_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(actual, expected_ms_tensor);
    ASSERT_OK(iterator->GetNextRow(&next_row));
    ++count;
  }

  EXPECT_EQ(count, 7);

  // Shut down the pipeline explicitly.
  iterator->Stop();
}
4397
4398 /// Feature: FastText
4399 /// Description: Test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
4400 /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
4401 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestFastTextAllParams) {
  // Test with all parameters.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextAllParams.";
  // Source dataset: one token per row, kept in file order.
  std::string words_path = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Build the FastText table from the .vec file with default arguments.
  std::string vectors_path = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  Status status = FastText::BuildFromFile(&fast_text, vectors_path);
  EXPECT_EQ(status, Status::OK());

  // unknown_init covers missing tokens; the third argument enables the
  // lower-case fallback lookup for tokens containing capital letters.
  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  std::shared_ptr<TensorTransform> to_vectors = std::make_shared<text::ToVectors>(fast_text, unknown_init, true);
  EXPECT_NE(to_vectors, nullptr);

  // Map every token onto its embedding vector.
  dataset = dataset->Map({to_vectors}, {"text"});
  EXPECT_NE(dataset, nullptr);

  // Launch the pipeline.
  std::shared_ptr<Iterator> iterator = dataset->CreateIterator();
  EXPECT_NE(iterator, nullptr);

  // Expected embedding for each input row; the -1 rows use unknown_init.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {-1, -1, -1, -1, -1, -1},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {-1, -1, -1, -1, -1, -1}};
  const dsize_t kDim = 6;

  std::unordered_map<std::string, mindspore::MSTensor> next_row;
  ASSERT_OK(iterator->GetNextRow(&next_row));
  uint64_t count = 0;
  while (!next_row.empty()) {
    auto actual = next_row["text"];
    MS_LOG(INFO) << actual.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", actual);
    TensorPtr expected_de_tensor;
    ASSERT_OK(Tensor::CreateFromVector(expected[count], TensorShape({kDim}), &expected_de_tensor));
    auto expected_ms_tensor = mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(expected_de_tensor));
    EXPECT_MSTENSOR_EQ(actual, expected_ms_tensor);
    ASSERT_OK(iterator->GetNextRow(&next_row));
    ++count;
  }

  EXPECT_EQ(count, 7);

  // Shut down the pipeline explicitly.
  iterator->Stop();
}
4459
4460 /// Feature: FastText
4461 /// Description: Test with pre-vectors set that have the different dimension
4462 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextDifferentDimension) {
  // Tokens don't have the same number of vectors.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextDifferentDimension.";

  // The dataset mirrors the positive cases; the failure under test comes
  // from BuildFromFile below, not from the pipeline.
  std::string words_path = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> dataset = TextFile({words_path}, 0, ShuffleMode::kFalse);
  EXPECT_NE(dataset, nullptr);

  // Loading a file whose rows have inconsistent dimensions must fail.
  std::string vectors_path = datasets_root_path_ + "/test_fast_text/fasttext_dim_different.vec";
  std::shared_ptr<FastText> fast_text;
  Status status = FastText::BuildFromFile(&fast_text, vectors_path, 100);
  EXPECT_NE(status, Status::OK());
}
4477
4478 /// Feature: FastText
4479 /// Description: Test with the parameter max_vectors that is <= 0
4480 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextMaxVectorsLessThanZero) {
  // BuildFromFile must reject a non-positive max_vectors value.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextMaxVectorsLessThanZero.";

  // Source dataset of tokens that would be looked up.
  std::string token_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> token_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(token_ds, nullptr);

  // max_vectors of -1 is invalid and should produce a non-OK status.
  std::string pre_vectors = datasets_root_path_ + "/test_fast_text/fast_text.vec";
  std::shared_ptr<FastText> fast_text;
  Status rc = FastText::BuildFromFile(&fast_text, pre_vectors, -1);
  EXPECT_NE(rc, Status::OK());
}
4495
4496 /// Feature: FastText
4497 /// Description: Test with the pre-vectors file that is empty
4498 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithEmptyFile) {
  // Loading pre-vectors from an empty file must fail.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithEmptyFile.";

  // Source dataset of tokens that would be looked up.
  std::string token_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> token_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(token_ds, nullptr);

  // The fixture file is empty, so BuildFromFile should report an error.
  std::string pre_vectors = datasets_root_path_ + "/test_fast_text/fasttext_empty.vec";
  std::shared_ptr<FastText> fast_text;
  Status rc = FastText::BuildFromFile(&fast_text, pre_vectors);
  EXPECT_NE(rc, Status::OK());
}
4513
4514 /// Feature: FastText
/// Description: Test with a pre-vectors file that does not exist
4516 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithNotExistFile) {
  // Loading pre-vectors from a path that does not exist must fail.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithNotExistFile.";

  // Source dataset of tokens that would be looked up.
  std::string token_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> token_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(token_ds, nullptr);

  // This path is intentionally missing; BuildFromFile should report an error.
  std::string pre_vectors = datasets_root_path_ + "/test_fast_text/no_fasttext.vec";
  std::shared_ptr<FastText> fast_text;
  Status rc = FastText::BuildFromFile(&fast_text, pre_vectors);
  EXPECT_NE(rc, Status::OK());
}
4531
4532 /// Feature: FastText
4533 /// Description: Test with the pre-vectors set that has a situation that info-head is not the first line in the set
4534 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithWrongInfoFile) {
  // The info-head line is not the first line of the file, so loading must fail.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongInfoFile.";

  // Source dataset of tokens that would be looked up.
  std::string token_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> token_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(token_ds, nullptr);

  // Fixture with a misplaced info header; BuildFromFile should report an error.
  std::string pre_vectors = datasets_root_path_ + "/test_fast_text/fasttext_with_wrong_info.vec";
  std::shared_ptr<FastText> fast_text;
  Status rc = FastText::BuildFromFile(&fast_text, pre_vectors);
  EXPECT_NE(rc, Status::OK());
}
4549
4550 /// Feature: FastText
4551 /// Description: Test with the pre-vectors set that has a wrong suffix
4552 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestFastTextWithWrongSuffix) {
  // Wrong file suffix: FastText expects a ".vec" file, this one ends in ".txt".
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestFastTextWithWrongSuffix.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/test_fast_text/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // BuildFromFile should reject the non-".vec" suffix with a non-OK status.
  std::string vectors_dir = datasets_root_path_ + "/test_fast_text/fast_text.txt";
  std::shared_ptr<FastText> fast_text;
  Status s = FastText::BuildFromFile(&fast_text, vectors_dir);
  EXPECT_NE(s, Status::OK());
}
4567
4568 /// Feature: GloVe
4569 /// Description: Test with default parameter in function BuildFromFile and function Lookup
4570 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestGloVeDefaultParam) {
  // Looks up tokens through text::ToVectors built with only the mandatory
  // pre-vectors file path; out-of-vocabulary tokens map to all-zero vectors.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeDefaultParam.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  std::shared_ptr<GloVe> glove;
  Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  EXPECT_EQ(s, Status::OK());

  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // One expected 6-dim embedding per input token; all-zero rows are OOV tokens.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {0, 0, 0, 0, 0, 0},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {0, 0, 0, 0, 0, 0}};
  const dsize_t dim = 6;  // embedding width; loop-invariant, hoisted out of the loop
  while (row.size() != 0) {
    // Fail fast if the pipeline yields more rows than expected — indexing
    // expected[i] past the end would be undefined behavior.
    ASSERT_LT(i, expected.size());
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
4628
4629 /// Feature: GloVe
4630 /// Description: Test with all parameters which include `path` and `max_vector` in function BuildFromFile
4631 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestGloVeAllBuildfromfileParams) {
  // Same lookup as TestGloVeDefaultParam, but BuildFromFile is given both
  // the file path and an explicit max_vectors (100) argument.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeAllBuildfromfileParams.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  std::shared_ptr<GloVe> glove;
  Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100);
  EXPECT_EQ(s, Status::OK());

  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // One expected 6-dim embedding per input token; all-zero rows are OOV tokens.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {0, 0, 0, 0, 0, 0},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {0, 0, 0, 0, 0, 0}};
  const dsize_t dim = 6;  // embedding width; loop-invariant, hoisted out of the loop
  while (row.size() != 0) {
    // Fail fast if the pipeline yields more rows than expected — indexing
    // expected[i] past the end would be undefined behavior.
    ASSERT_LT(i, expected.size());
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
4689
4690 /// Feature: GloVe
4691 /// Description: Test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
4692 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestGloVeUnknownInit) {
  // Supplies a custom unknown_init vector to text::ToVectors, so OOV tokens
  // map to {-1,...} instead of the default all-zero vector.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeUnknownInit.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  std::shared_ptr<GloVe> glove;
  Status s = GloVe::BuildFromFile(&glove, vectors_dir, 100);
  EXPECT_EQ(s, Status::OK());

  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove, unknown_init);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // One expected 6-dim embedding per input token; {-1,...} rows are OOV tokens.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {-1, -1, -1, -1, -1, -1},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {-1, -1, -1, -1, -1, -1}};
  const dsize_t dim = 6;  // embedding width; loop-invariant, hoisted out of the loop
  while (row.size() != 0) {
    // Fail fast if the pipeline yields more rows than expected — indexing
    // expected[i] past the end would be undefined behavior.
    ASSERT_LT(i, expected.size());
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
4751
4752 /// Feature: GloVe
4753 /// Description: Test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
4754 /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
4755 /// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestGloVeAllParams) {
  // Exercises every ToVectors parameter: unknown_init plus
  // lower_case_backup=true so upper-case tokens still hit the vocabulary.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeAllParams.";
  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  std::shared_ptr<GloVe> glove;
  Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  EXPECT_EQ(s, Status::OK());

  std::vector<float> unknown_init = {-1, -1, -1, -1, -1, -1};
  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(glove, unknown_init, true);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // One expected 6-dim embedding per input token; {-1,...} rows are OOV tokens.
  std::vector<std::vector<float>> expected = {{0.418, 0.24968, -0.41242, 0.1217, 0.34527, -0.04445718411},
                                              {-1, -1, -1, -1, -1, -1},
                                              {0.15164, 0.30177, -0.16763, 0.17684, 0.31719, 0.33973},
                                              {0.70853, 0.57088, -0.4716, 0.18048, 0.54449, 0.72603},
                                              {0.68047, -0.039263, 0.30186, -0.17792, 0.42962, 0.032246},
                                              {0.26818, 0.14346, -0.27877, 0.016257, 0.11384, 0.69923},
                                              {-1, -1, -1, -1, -1, -1}};
  const dsize_t dim = 6;  // embedding width; loop-invariant, hoisted out of the loop
  while (row.size() != 0) {
    // Fail fast if the pipeline yields more rows than expected — indexing
    // expected[i] past the end would be undefined behavior.
    ASSERT_LT(i, expected.size());
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], TensorShape({dim}), &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    EXPECT_MSTENSOR_EQ(ind, ms_expected_item);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
4813
4814 /// Feature: GloVe
/// Description: Test with a pre-vectors file whose rows have inconsistent dimensions
4816 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeDifferentDimension) {
  // Building GloVe from a file whose rows have inconsistent vector widths must fail.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeDifferentDimension.";

  // Source dataset of tokens that would be looked up.
  std::string token_file = datasets_root_path_ + "/testGloVe/words.txt";
  std::shared_ptr<Dataset> token_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(token_ds, nullptr);

  // Pre-vectors file with mismatched dimensions: BuildFromFile should report an error.
  std::string pre_vectors = datasets_root_path_ + "/testGloVe/glove.6B.dim_different.txt";
  std::shared_ptr<GloVe> glove;
  Status rc = GloVe::BuildFromFile(&glove, pre_vectors, 100);
  EXPECT_NE(rc, Status::OK());
}
4831
4832 /// Feature: GloVe
4833 /// Description: Test with the parameter max_vectors that is <= 0
4834 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeMaxVectorsLessThanZero) {
  // BuildFromFile must reject a non-positive max_vectors value.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeMaxVectorsLessThanZero.";

  // Source dataset of tokens that would be looked up.
  std::string token_file = datasets_root_path_ + "/testGloVe/words.txt";
  std::shared_ptr<Dataset> token_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(token_ds, nullptr);

  // max_vectors of -1 is invalid and should produce a non-OK status.
  std::string pre_vectors = datasets_root_path_ + "/testGloVe/glove.6B.test.txt";
  std::shared_ptr<GloVe> glove;
  Status rc = GloVe::BuildFromFile(&glove, pre_vectors, -1);
  EXPECT_NE(rc, Status::OK());
}
4849
4850 /// Feature: GloVe
4851 /// Description: Test with the pre-vectors file that is empty
4852 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeWithEmptyFile) {
  // Loading pre-vectors from an empty file must fail.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithEmptyFile.";

  // Source dataset of tokens that would be looked up.
  std::string token_file = datasets_root_path_ + "/testGloVe/words.txt";
  std::shared_ptr<Dataset> token_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(token_ds, nullptr);

  // The fixture file is empty, so BuildFromFile should report an error.
  std::string pre_vectors = datasets_root_path_ + "/testGloVe/glove.6B.empty.txt";
  std::shared_ptr<GloVe> glove;
  Status rc = GloVe::BuildFromFile(&glove, pre_vectors);
  EXPECT_NE(rc, Status::OK());
}
4867
4868 /// Feature: GloVe
/// Description: Test with a pre-vectors file that does not exist
4870 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeWithNotExistFile) {
  // Test with not exist file.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithNotExistFile.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // Fix: use a path that truly does not exist. The previous path
  // ("glove.6B.empty.txt") pointed at the empty-file fixture, duplicating
  // TestGloVeWithEmptyFile instead of exercising the missing-file error path
  // this test is named for (cf. TestFastTextWithNotExistFile).
  std::string vectors_dir = datasets_root_path_ + "/testGloVe/no_glove.6B.test.txt";
  std::shared_ptr<GloVe> glove;
  Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  EXPECT_NE(s, Status::OK());
}
4885
4886 /// Feature: GloVe
4887 /// Description: Test with the pre-vectors set that has a situation that info-head is not the first line in the set
4888 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeWithWrongInfoFile) {
  // The info-head line is not the first line of the file, so loading must fail.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithWrongInfoFile.";

  // Source dataset of tokens that would be looked up.
  std::string token_file = datasets_root_path_ + "/testGloVe/words.txt";
  std::shared_ptr<Dataset> token_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(token_ds, nullptr);

  // Fixture with a misplaced info header; BuildFromFile should report an error.
  std::string pre_vectors = datasets_root_path_ + "/testGloVe/glove.6B.with_wrong_info.txt";
  std::shared_ptr<GloVe> glove;
  Status rc = GloVe::BuildFromFile(&glove, pre_vectors);
  EXPECT_NE(rc, Status::OK());
}
4903
4904 /// Feature: GloVe
4905 /// Description: Test with the pre-vectors set that has a wrong format
4906 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestGloVeWithWrongFormat) {
  // Wrong file format: the ".vec" fixture is not in the format GloVe expects.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestGloVeWithWrongFormat.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testGloVe/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  // BuildFromFile should reject this wrongly formatted file with a non-OK status.
  std::string vectors_dir = datasets_root_path_ + "/testGloVe/glove.6B.tests.vec";
  std::shared_ptr<GloVe> glove;
  Status s = GloVe::BuildFromFile(&glove, vectors_dir);
  EXPECT_NE(s, Status::OK());
}
4921
4922 /// Feature: CharNGram
4923 /// Description: Test with default parameter in function BuildFromFile and function Lookup
/// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestCharNGramDefaultParam) {
  // Looks up tokens through text::ToVectors built with only the mandatory
  // CharNGram pre-vectors file path; OOV tokens map to all-zero vectors.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramDefaultParam.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  std::shared_ptr<CharNGram> char_n_gram;
  Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
  EXPECT_EQ(s, Status::OK());
  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // One expected 5-dim embedding per input token; all-zero rows are OOV tokens.
  std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
                                              {0, 0, 0, 0, 0},
                                              {0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
                                              {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
                                              {0, 0, 0, 0, 0},
                                              {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
                                              {0, 0, 0, 0, 0}};
  while (row.size() != 0) {
    // Fail fast if the pipeline yields more rows than expected — indexing
    // expected[i] past the end would be undefined behavior.
    ASSERT_LT(i, expected.size());
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    // NOTE(review): only shapes are compared here, not values — unlike the
    // GloVe tests, which use EXPECT_MSTENSOR_EQ. Presumably intentional; confirm.
    std::vector<int64_t> ind_shape = ind.Shape();
    std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
    EXPECT_EQ(ind_shape, ms_expected_shape);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
4982
4983 /// Feature: CharNGram.
4984 /// Description: Test with all parameters which include `path` and `max_vector` in function BuildFromFile
/// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestCharNGramAllBuildfromfileParams) {
  // Same lookup as TestCharNGramDefaultParam, but BuildFromFile is given both
  // the file path and an explicit max_vectors (18) argument.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramAllBuildfromfileParams.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  std::shared_ptr<CharNGram> char_n_gram;
  Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, 18);
  EXPECT_EQ(s, Status::OK());

  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // One expected 5-dim embedding per input token; all-zero rows are OOV tokens.
  std::vector<std::vector<float>> expected = {{0, 0, 0, 0, 0},
                                              {0, 0, 0, 0, 0},
                                              {-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
                                              {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
                                              {0, 0, 0, 0, 0},
                                              {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
                                              {0, 0, 0, 0, 0}};
  while (row.size() != 0) {
    // Fail fast if the pipeline yields more rows than expected — indexing
    // expected[i] past the end would be undefined behavior.
    ASSERT_LT(i, expected.size());
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    // NOTE(review): only shapes are compared here, not values — unlike the
    // GloVe tests, which use EXPECT_MSTENSOR_EQ. Presumably intentional; confirm.
    std::vector<int64_t> ind_shape = ind.Shape();
    std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
    EXPECT_EQ(ind_shape, ms_expected_shape);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
5044
5045 /// Feature: CharNGram
5046 /// Description: Test with all parameters in function BuildFromFile and `unknown_init` in function Lookup
/// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestCharNGramUnknownInit) {
  // Supplies a custom unknown_init vector to text::ToVectors, so OOV tokens
  // map to {-1,...} instead of the default all-zero vector.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramUnknownInit.";

  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  std::shared_ptr<CharNGram> char_n_gram;
  Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir, 18);
  EXPECT_EQ(s, Status::OK());

  std::vector<float> unknown_init(5, -1);
  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram, unknown_init);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // One expected 5-dim embedding per input token; {-1,...} rows are OOV tokens.
  std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
                                              {-1, -1, -1, -1, -1},
                                              {-0.155665, 0.664073, -0.538499, 1.22657, -0.2162},
                                              {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
                                              {-1, -1, -1, -1, -1},
                                              {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
                                              {-1, -1, -1, -1, -1}};
  while (row.size() != 0) {
    // Fail fast if the pipeline yields more rows than expected — indexing
    // expected[i] past the end would be undefined behavior.
    ASSERT_LT(i, expected.size());
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    // NOTE(review): only shapes are compared here, not values — unlike the
    // GloVe tests, which use EXPECT_MSTENSOR_EQ. Presumably intentional; confirm.
    std::vector<int64_t> ind_shape = ind.Shape();
    std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
    EXPECT_EQ(ind_shape, ms_expected_shape);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
5107
5108 /// Feature: CharNGram
5109 /// Description: Test with all parameters which include `path` and `max_vectors` in function BuildFromFile and `token`,
5110 /// `unknown_init` and `lower_case_backup` in function Lookup. But some tokens have some big letters
/// Expectation: Return correct MSTensor which is equal to the expected
TEST_F(MindDataTestPipeline, TestCharNGramAllParams) {
  // Exercises every ToVectors parameter: unknown_init plus
  // lower_case_backup=true so upper-case tokens still hit the vocabulary.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramAllParams.";
  // Create a TextFile dataset
  std::string data_file = datasets_root_path_ + "/testVectors/words_with_big_letter.txt";
  std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(ds, nullptr);

  std::string vectors_dir = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  std::shared_ptr<CharNGram> char_n_gram;
  Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
  EXPECT_EQ(s, Status::OK());

  std::vector<float> unknown_init(5, -1);
  std::shared_ptr<TensorTransform> lookup = std::make_shared<text::ToVectors>(char_n_gram, unknown_init, true);
  EXPECT_NE(lookup, nullptr);

  // Create Map operation on ds
  ds = ds->Map({lookup}, {"text"});
  EXPECT_NE(ds, nullptr);

  // Create an iterator over the result of the above dataset
  std::shared_ptr<Iterator> iter = ds->CreateIterator();
  EXPECT_NE(iter, nullptr);

  // Iterate the dataset and get each row
  std::unordered_map<std::string, mindspore::MSTensor> row;
  ASSERT_OK(iter->GetNextRow(&row));

  uint64_t i = 0;
  // One expected 5-dim embedding per input token; {-1,...} rows are OOV tokens.
  std::vector<std::vector<float>> expected = {{-1, -1, -1, -1, -1},
                                              {-1, -1, -1, -1, -1},
                                              {0.117336, 0.362446, -0.983326, 0.939264, -0.05648},
                                              {0.657201, 2.11761, -1.59276, 0.432072, 1.21395},
                                              {-1, -1, -1, -1, -1},
                                              {-2.26956, 0.288491, -0.740001, 0.661703, 0.147355},
                                              {-1, -1, -1, -1, -1}};
  while (row.size() != 0) {
    // Fail fast if the pipeline yields more rows than expected — indexing
    // expected[i] past the end would be undefined behavior.
    ASSERT_LT(i, expected.size());
    auto ind = row["text"];
    MS_LOG(INFO) << ind.Shape();
    TEST_MS_LOG_MSTENSOR(INFO, "ind: ", ind);
    TensorPtr de_expected_item;
    ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_item));
    mindspore::MSTensor ms_expected_item =
      mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_item));
    // NOTE(review): only shapes are compared here, not values — unlike the
    // GloVe tests, which use EXPECT_MSTENSOR_EQ. Presumably intentional; confirm.
    std::vector<int64_t> ind_shape = ind.Shape();
    std::vector<int64_t> ms_expected_shape = ms_expected_item.Shape();
    EXPECT_EQ(ind_shape, ms_expected_shape);

    ASSERT_OK(iter->GetNextRow(&row));
    i++;
  }

  EXPECT_EQ(i, 7);

  // Manually terminate the pipeline
  iter->Stop();
}
5170
5171 /// Feature: CharNGram
/// Description: Test with a pre-vectors file whose rows have inconsistent dimensions
5173 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestCharNGramDifferentDimension) {
  // Building CharNGram from a file whose rows have inconsistent vector widths must fail.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramDifferentDimension.";

  // Source dataset of tokens that would be looked up.
  std::string token_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> token_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(token_ds, nullptr);

  // Pre-vectors file with mismatched dimensions: BuildFromFile should report an error.
  std::string pre_vectors = datasets_root_path_ + "/testVectors/char_n_gram_20_dim_different.txt";
  std::shared_ptr<CharNGram> char_n_gram;
  Status rc = CharNGram::BuildFromFile(&char_n_gram, pre_vectors);
  EXPECT_NE(rc, Status::OK());
}
5188
5189 /// Feature: CharNGram
5190 /// Description: Test with the parameter max_vectors that is <= 0
5191 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestCharNGramMaxVectorsLessThanZero) {
  // BuildFromFile must reject a non-positive max_vectors value.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramMaxVectorsLessThanZero.";

  // Source dataset of tokens that would be looked up.
  std::string token_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> token_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(token_ds, nullptr);

  // max_vectors of -1 is invalid and should produce a non-OK status.
  std::string pre_vectors = datasets_root_path_ + "/testVectors/char_n_gram_20.txt";
  std::shared_ptr<CharNGram> char_n_gram;
  Status rc = CharNGram::BuildFromFile(&char_n_gram, pre_vectors, -1);
  EXPECT_NE(rc, Status::OK());
}
5206
5207 /// Feature: CharNGram
5208 /// Description: Test with the pre-vectors file that is empty
5209 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline, TestCharNGramWithEmptyFile) {
  // Loading pre-vectors from an empty file must fail.
  MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramWithEmptyFile.";

  // Source dataset of tokens that would be looked up.
  std::string token_file = datasets_root_path_ + "/testVectors/words.txt";
  std::shared_ptr<Dataset> token_ds = TextFile({token_file}, 0, ShuffleMode::kFalse);
  EXPECT_NE(token_ds, nullptr);

  // The fixture file is empty, so BuildFromFile should report an error.
  std::string pre_vectors = datasets_root_path_ + "/testVectors/vectors_empty.txt";
  std::shared_ptr<CharNGram> char_n_gram;
  Status rc = CharNGram::BuildFromFile(&char_n_gram, pre_vectors);
  EXPECT_NE(rc, Status::OK());
}
5224
5225 /// Feature: CharNGram
5226 /// Description: Test with the pre-vectors file that is not exist
5227 /// Expectation: Throw correct error and message
TEST_F(MindDataTestPipeline,TestCharNGramsWithNotExistFile)5228 TEST_F(MindDataTestPipeline, TestCharNGramsWithNotExistFile) {
5229 // Test with not exist file.
5230 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestCharNGramsWithNotExistFile.";
5231
5232 // Create a TextFile dataset
5233 std::string data_file = datasets_root_path_ + "/testVectors/words.txt";
5234 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
5235 EXPECT_NE(ds, nullptr);
5236
5237 std::string vectors_dir = datasets_root_path_ + "/testVectors/no_vectors.txt";
5238 std::shared_ptr<CharNGram> char_n_gram;
5239 Status s = CharNGram::BuildFromFile(&char_n_gram, vectors_dir);
5240 EXPECT_NE(s, Status::OK());
5241 }
5242
5243 /// Feature: AddToken op
5244 /// Description: Test input 1d of AddToken op successfully
5245 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestAddTokenPipelineSuccess)5246 TEST_F(MindDataTestPipeline, TestAddTokenPipelineSuccess) {
5247 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestAddTokenPipelineSuccess.";
5248
5249 // Create a TextFile dataset
5250 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
5251 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
5252 EXPECT_NE(ds, nullptr);
5253
5254 // Create Take operation on ds
5255 ds = ds->Take(1);
5256 EXPECT_NE(ds, nullptr);
5257
5258 // Create white_tokenizer operation on ds
5259 std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
5260 EXPECT_NE(white_tokenizer, nullptr);
5261
5262 // Create add_token operation on ds
5263 std::shared_ptr<TensorTransform> add_token = std::make_shared<text::AddToken>("TOKEN", true);
5264 EXPECT_NE(add_token, nullptr);
5265
5266 // Create Map operation on ds
5267 ds = ds->Map({white_tokenizer, add_token}, {"text"});
5268 EXPECT_NE(ds, nullptr);
5269
5270 // Create an iterator over the result of the above dataset
5271 // This will trigger the creation of the Execution Tree and launch it.
5272 std::shared_ptr<Iterator> iter = ds->CreateIterator();
5273 EXPECT_NE(iter, nullptr);
5274
5275 // Iterate the dataset and get each row
5276 std::unordered_map<std::string, mindspore::MSTensor> row;
5277 ASSERT_OK(iter->GetNextRow(&row));
5278
5279 std::vector<std::string> expected = {"TOKEN", "This", "is", "a", "text", "file."};
5280 std::shared_ptr<Tensor> de_expected_tensor;
5281 ASSERT_OK(Tensor::CreateFromVector(expected, &de_expected_tensor));
5282 mindspore::MSTensor expected_tensor =
5283 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
5284
5285 uint64_t i = 0;
5286 while (row.size() != 0) {
5287 auto ind = row["text"];
5288 EXPECT_MSTENSOR_EQ(ind, expected_tensor);
5289 ASSERT_OK(iter->GetNextRow(&row));
5290 i++;
5291 }
5292
5293 EXPECT_EQ(i, 1);
5294
5295 // Manually terminate the pipeline
5296 iter->Stop();
5297 }
5298
5299 /// Feature: Truncate
5300 /// Description: Test Truncate basic usage max_seq_len less length
5301 /// Expectation: Output is equal to the expected output
TEST_F(MindDataTestPipeline,TestTruncateSuccess1D)5302 TEST_F(MindDataTestPipeline, TestTruncateSuccess1D) {
5303 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateSuccess1D.";
5304 // Testing basic Truncate
5305
5306 // Create a TextFile dataset
5307 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
5308 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
5309 EXPECT_NE(ds, nullptr);
5310
5311 // Create white_tokenizer operation on ds
5312 std::shared_ptr<TensorTransform> white_tokenizer = std::make_shared<text::WhitespaceTokenizer>();
5313 EXPECT_NE(white_tokenizer, nullptr);
5314
5315 // Create a truncate operation on ds
5316 std::shared_ptr<TensorTransform> truncate = std::make_shared<text::Truncate>(3);
5317 EXPECT_NE(truncate, nullptr);
5318
5319 // Create Map operation on ds
5320 ds = ds->Map({white_tokenizer, truncate}, {"text"});
5321 EXPECT_NE(ds, nullptr);
5322
5323 // Create an iterator over the result of the above dataset
5324 // This will trigger the creation of the Execution Tree and launch it.
5325 std::shared_ptr<Iterator> iter = ds->CreateIterator();
5326 EXPECT_NE(iter, nullptr);
5327
5328 // Iterate the dataset and get each row
5329 std::unordered_map<std::string, mindspore::MSTensor> row;
5330 ASSERT_OK(iter->GetNextRow(&row));
5331
5332 std::vector<std::vector<std::string>> expected = {
5333 {"This", "is", "a"}, {"Be", "happy", "every"}, {"Good", "luck", "to"}};
5334
5335 uint64_t i = 0;
5336 while (row.size() != 0) {
5337 auto ind = row["text"];
5338
5339 std::shared_ptr<Tensor> de_expected_tensor;
5340 ASSERT_OK(Tensor::CreateFromVector(expected[i], &de_expected_tensor));
5341 mindspore::MSTensor expected_tensor =
5342 mindspore::MSTensor(std::make_shared<mindspore::dataset::DETensor>(de_expected_tensor));
5343 EXPECT_MSTENSOR_EQ(ind, expected_tensor);
5344
5345 ASSERT_OK(iter->GetNextRow(&row));
5346 i++;
5347 }
5348
5349 EXPECT_EQ(i, 3);
5350
5351 // Manually terminate the pipeline
5352 iter->Stop();
5353 }
5354
5355 /// Feature: Truncate
5356 /// Description: Test the incorrect parameter of Truncate interface
5357 /// Expectation: Error message is logged, and CreateIterator() for invalid pipeline returns nullptr
TEST_F(MindDataTestPipeline,TestTruncateFail)5358 TEST_F(MindDataTestPipeline, TestTruncateFail) {
5359 // Testing the incorrect parameter of Truncate interface.
5360 MS_LOG(INFO) << "Doing MindDataTestPipeline-TestTruncateFail.";
5361
5362 // Create a TextFile dataset
5363 std::string data_file = datasets_root_path_ + "/testTextFileDataset/1.txt";
5364 std::shared_ptr<Dataset> ds = TextFile({data_file}, 0, ShuffleMode::kFalse);
5365 EXPECT_NE(ds, nullptr);
5366
5367 // Testing the parameter max_seq_len less than 0
5368 // Create a truncate operation on ds
5369 std::shared_ptr<TensorTransform> truncate = std::make_shared<text::Truncate>(-1);
5370 EXPECT_NE(truncate, nullptr);
5371
5372 // Create a Map operation on ds
5373 ds = ds->Map({truncate});
5374 EXPECT_NE(ds, nullptr);
5375
5376 std::shared_ptr<Iterator> iter = ds->CreateIterator();
5377 // Expect failure: invalid Truncate input (The parameter max_seq_len must be greater than 0)
5378 EXPECT_EQ(iter, nullptr);
5379 }
5380