# Copyright 2019 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
import copy
import numpy as np
import pytest

import mindspore.common.dtype as mstype
import mindspore.dataset as ds
import mindspore.dataset.engine.iterators as it
from mindspore import log as logger
from mindspore import Tensor


# Generate 1d int numpy array from 0 - 63
def generator_1d():
    for i in range(64):
        yield (np.array([i]),)


class DatasetGenerator:
    def __init__(self):
        pass

    def __getitem__(self, item):
        return (np.array([item]),)

    def __len__(self):
        return 10


class DatasetGeneratorLarge:
    def __init__(self):
        self.data = np.array(range(4000))

    def __getitem__(self, item):
        return (self.data + item, self.data * 10)

    def __len__(self):
        return 10


def test_generator_0():
    """
    Test 1D Generator
    """
    logger.info("Test 1D Generator : 0 - 63")

    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_1d, ["data"])

    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i])
        np.testing.assert_array_equal(item["data"], golden)
        i = i + 1


# Generate md int numpy array from [[0, 1], [2, 3]] to [[63, 64], [65, 66]]
def generator_md():
    for i in range(64):
        yield (np.array([[i, i + 1], [i + 2, i + 3]]),)


def test_generator_1():
    """
    Test MD Generator
    """
    logger.info("Test MD Generator : 0 - 63, with shape [2, 2]")

    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_md, ["data"])

    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item["data"], golden)
        i = i + 1


# Generate two columns, the first column is from Generator1D, the second column is from GeneratorMD
def generator_mc(maxid=64):
    for i in range(maxid):
        yield (np.array([i]), np.array([[i, i + 1], [i + 2, i + 3]]))


def test_generator_2():
    """
    Test multi column generator
    """
    logger.info("Test multi column generator")

    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc, ["col0", "col1"])

    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i])
        np.testing.assert_array_equal(item["col0"], golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item["col1"], golden)
        i = i + 1


def test_generator_3():
    """
    Test 1D Generator + repeat(4)
    """
    logger.info("Test 1D Generator : 0 - 63 + Repeat(4)")

    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_1d, ["data"])

    data1 = data1.repeat(4)

    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i])
        np.testing.assert_array_equal(item["data"], golden)
        i = i + 1
        if i == 64:
            i = 0
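

# Note: repeat(4) in test_generator_3 replays the 64-row generator four times,
# which is why the expected-value counter above resets whenever i reaches 64.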


def test_generator_4():
    """
    Test fixed size 1D Generator + batch
    """
    logger.info("Test 1D Generator : 0 - 63 + batch(4)")

    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_1d, ["data"])

    data1 = data1.batch(4)

    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([[i], [i + 1], [i + 2], [i + 3]])
        np.testing.assert_array_equal(item["data"], golden)
        i = i + 4


def generator_with_type(t):
    for i in range(64):
        yield (np.array([i], dtype=t),)


def type_tester(t):
    logger.info("Test with Type {}".format(t.__name__))

    # apply dataset operations
    data1 = ds.GeneratorDataset((lambda: generator_with_type(t)), ["data"])

    data1 = data1.batch(4)

    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
        np.testing.assert_array_equal(item["data"], golden)
        i = i + 4


def test_generator_5():
    """
    Test 1D Generator on different data type
    """
    logger.info("Test 1D Generator on all data types")

    types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32, np.float64]

    for t in types:
        type_tester(t)


def type_tester_with_type_check(t, c):
    logger.info("Test with Type {}".format(t.__name__))

    # apply dataset operations
    data1 = ds.GeneratorDataset((lambda: generator_with_type(t)), ["data"], column_types=[c])

    data1 = data1.batch(4)

    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
        np.testing.assert_array_equal(item["data"], golden)
        i = i + 4


def test_generator_6():
    """
    Test 1D Generator on different data type with type check
    """
    logger.info("Test 1D Generator on all data types with type check")

    np_types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32,
                np.float64]
    de_types = [mstype.int8, mstype.int16, mstype.int32, mstype.int64, mstype.uint8, mstype.uint16, mstype.uint32,
                mstype.uint64, mstype.float32, mstype.float64]

    for i, _ in enumerate(np_types):
        type_tester_with_type_check(np_types[i], de_types[i])


def generator_with_type_2c(t):
    for i in range(64):
        yield (np.array([i], dtype=t), np.array([i], dtype=t))


def type_tester_with_type_check_2c(t, c):
    logger.info("Test with Type {}".format(t.__name__))

    # apply dataset operations
    data1 = ds.GeneratorDataset((lambda: generator_with_type_2c(t)), ["data0", "data1"], column_types=c)

    data1 = data1.batch(4)

    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
        np.testing.assert_array_equal(item["data0"], golden)
        i = i + 4


def test_generator_7():
    """
    Test 2 column Generator on different data type with type check
    """
    logger.info("Test 2 column Generator on all data types with type check")

    np_types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32,
                np.float64]
    de_types = [mstype.int8, mstype.int16, mstype.int32, mstype.int64, mstype.uint8, mstype.uint16, mstype.uint32,
                mstype.uint64, mstype.float32, mstype.float64]

    for i, _ in enumerate(np_types):
        type_tester_with_type_check_2c(np_types[i], [None, de_types[i]])
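

# Note: test_generator_7 passes [None, de_types[i]] as column_types, so only the
# second column ("data1") has a declared type; the first column is left to the
# generator's own numpy dtype, which is what the assertion on "data0" checks.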


def test_generator_8():
    """
    Test multi column generator with a few map ops
    """
    logger.info("Test multi column generator with map ops to check the order too")

    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
    data1 = data1.map(operations=(lambda x: x * 3), input_columns="col0", output_columns="out0",
                      num_parallel_workers=2)
    data1 = data1.map(operations=(lambda x: (x * 7, x)), input_columns="col1", output_columns=["out1", "out2"],
                      num_parallel_workers=2, column_order=["out0", "out1", "out2"])
    data1 = data1.map(operations=(lambda x: x + 1), input_columns="out2", output_columns="out2",
                      num_parallel_workers=2)

    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i * 3])
        np.testing.assert_array_equal(item["out0"], golden)
        golden = np.array([[i * 7, (i + 1) * 7], [(i + 2) * 7, (i + 3) * 7]])
        np.testing.assert_array_equal(item["out1"], golden)
        golden = np.array([[i + 1, i + 2], [i + 3, i + 4]])
        np.testing.assert_array_equal(item["out2"], golden)
        i = i + 1


def test_generator_9():
    """
    Test map column order when len(input_columns) == len(output_columns).
    """
    logger.info("Test map column order when len(input_columns) == len(output_columns).")

    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["image", "label"])
    data2 = ds.GeneratorDataset(generator_mc(2048), ["label", "image"])
    data1 = data1.map(operations=(lambda x: x * 3), input_columns="label",
                      num_parallel_workers=4)
    data2 = data2.map(operations=(lambda x: x * 3), input_columns="label",
                      num_parallel_workers=4)

    # Expected column order is not changed.
    # data1: data[0] is "image" and data[1] is "label"
    # data2: data[0] is "label" and data[1] is "image"
    i = 0
    for data1, data2 in zip(data1, data2):  # each row is a tuple of Tensors
        golden = np.array([i])
        np.testing.assert_array_equal(data1[0].asnumpy(), golden)
        golden = np.array([[i * 3, (i + 1) * 3], [(i + 2) * 3, (i + 3) * 3]])
        np.testing.assert_array_equal(data1[1].asnumpy(), golden)

        golden = np.array([i * 3])
        np.testing.assert_array_equal(data2[0].asnumpy(), golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(data2[1].asnumpy(), golden)
        i = i + 1


def test_generator_10():
    """
    Test map column order when len(input_columns) != len(output_columns).
    """
    logger.info("Test map column order when len(input_columns) != len(output_columns).")

    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"])
    data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns="col1", output_columns=["out1", "out2"],
                      column_order=['col0', 'out1', 'out2'], num_parallel_workers=2)

    # Expected column order is |col0|out1|out2|
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        golden = np.array([i])
        np.testing.assert_array_equal(item[0], golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item[1], golden)
        golden = np.array([[i * 5, (i + 1) * 5], [(i + 2) * 5, (i + 3) * 5]])
        np.testing.assert_array_equal(item[2], golden)
        i = i + 1
311 """ 312 logger.info("Test map column order when len(input_columns) != len(output_columns).") 313 314 # apply dataset operations 315 data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"]) 316 data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns="col1", output_columns=["out1", "out2"], 317 column_order=['col0', 'out1', 'out2'], num_parallel_workers=2) 318 319 # Expected column order is |col0|out1|out2| 320 i = 0 321 for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True): 322 golden = np.array([i]) 323 np.testing.assert_array_equal(item[0], golden) 324 golden = np.array([[i, i + 1], [i + 2, i + 3]]) 325 np.testing.assert_array_equal(item[1], golden) 326 golden = np.array([[i * 5, (i + 1) * 5], [(i + 2) * 5, (i + 3) * 5]]) 327 np.testing.assert_array_equal(item[2], golden) 328 i = i + 1 329 330 331def test_generator_11(): 332 """ 333 Test map column order when len(input_columns) != len(output_columns). 334 """ 335 logger.info("Test map column order when len(input_columns) != len(output_columns), " 336 "and column_order drops some columns.") 337 338 # apply dataset operations 339 data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"]) 340 data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns="col1", output_columns=["out1", "out2"], 341 column_order=['out1', 'out2'], num_parallel_workers=2) 342 343 # Expected column order is |out1|out2| 344 i = 0 345 for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True): 346 # len should be 2 because col0 is dropped (not included in column_order) 347 assert len(item) == 2 348 golden = np.array([[i, i + 1], [i + 2, i + 3]]) 349 np.testing.assert_array_equal(item[0], golden) 350 golden = np.array([[i * 5, (i + 1) * 5], [(i + 2) * 5, (i + 3) * 5]]) 351 np.testing.assert_array_equal(item[1], golden) 352 i = i + 1 353 354 355def test_generator_12(): 356 """ 357 Test map column order when input_columns and output_columns are None. 358 """ 359 logger.info("Test map column order when input_columns and output_columns are None.") 360 361 # apply dataset operations 362 data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"]) 363 data1 = data1.map(operations=(lambda x: (x * 5)), num_parallel_workers=2) 364 365 # Expected column order is |col0|col1| 366 i = 0 367 for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True): 368 assert len(item) == 2 369 golden = np.array([i * 5]) 370 np.testing.assert_array_equal(item[0], golden) 371 golden = np.array([[i, i + 1], [i + 2, i + 3]]) 372 np.testing.assert_array_equal(item[1], golden) 373 i = i + 1 374 375 data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"]) 376 data1 = data1.map(operations=(lambda x: (x * 5)), column_order=["col1", "col0"], num_parallel_workers=2) 377 378 # Expected column order is |col0|col1| 379 i = 0 380 for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True): 381 assert len(item) == 2 382 golden = np.array([i * 5]) 383 np.testing.assert_array_equal(item[1], golden) 384 golden = np.array([[i, i + 1], [i + 2, i + 3]]) 385 np.testing.assert_array_equal(item[0], golden) 386 i = i + 1 387 388 389def test_generator_13(): 390 """ 391 Test map column order when input_columns is None. 
392 """ 393 logger.info("Test map column order when input_columns is None.") 394 395 # apply dataset operations 396 data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"]) 397 data1 = data1.map(operations=(lambda x: (x * 5)), output_columns=["out0"], num_parallel_workers=2) 398 399 # Expected column order is |out0|col1| 400 i = 0 401 for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True): 402 assert len(item) == 2 403 golden = np.array([i * 5]) 404 np.testing.assert_array_equal(item[0], golden) 405 golden = np.array([[i, i + 1], [i + 2, i + 3]]) 406 np.testing.assert_array_equal(item[1], golden) 407 i = i + 1 408 409 for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary 410 # len should be 2 because col0 is dropped (not included in column_order) 411 assert len(item) == 2 412 golden = np.array([i * 5]) 413 np.testing.assert_array_equal(item["out0"], golden) 414 golden = np.array([[i, i + 1], [i + 2, i + 3]]) 415 np.testing.assert_array_equal(item["col1"], golden) 416 i = i + 1 417 418 419def test_generator_14(): 420 """ 421 Test 1D Generator MP + CPP sampler 422 """ 423 logger.info("Test 1D Generator MP : 0 - 63") 424 # Sometimes there are some ITERATORS left in ITERATORS_LIST when run all UTs together, 425 # and cause core dump and blocking in this UT. Add cleanup() here to fix it. 426 it._cleanup() # pylint: disable=W0212 427 428 ##reduce memory needed by reducing queue size 429 prefetch_original = ds.config.get_prefetch_size() 430 ds.config.set_prefetch_size(1) 431 432 source = [(np.array([x]),) for x in range(256)] 433 ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler(), 434 num_parallel_workers=4, max_rowsize=1).repeat(2) 435 i = 0 436 for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary 437 golden = np.array([i]) 438 np.testing.assert_array_equal(data["data"], golden) 439 i = i + 1 440 if i == 256: 441 i = 0 442 443 ds.config.set_prefetch_size(prefetch_original) 444 445def test_generator_15(): 446 """ 447 Test 1D Generator MP + Python sampler 448 """ 449 logger.info("Test 1D Generator MP : 0 - 63") 450 451 ##reduce memory needed by reducing queue size 452 prefetch_original = ds.config.get_prefetch_size() 453 ds.config.set_prefetch_size(1) 454 455 sampler = [x for x in range(256)] 456 source = [(np.array([x]),) for x in range(256)] 457 ds1 = ds.GeneratorDataset(source, ["data"], sampler=sampler, 458 num_parallel_workers=4, max_rowsize=1).repeat(1) 459 i = 0 460 for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary 461 golden = np.array([i]) 462 np.testing.assert_array_equal(data["data"], golden) 463 i = i + 1 464 if i == 256: 465 i = 0 466 467 ds.config.set_prefetch_size(prefetch_original) 468 469def test_generator_16(): 470 """ 471 Test multi column generator Mp + CPP sampler 472 """ 473 logger.info("Test multi column generator") 474 475 source = [(np.array([x]), np.array([x + 1])) for x in range(256)] 476 # apply dataset operations 477 data1 = ds.GeneratorDataset(source, ["col0", "col1"], sampler=ds.SequentialSampler()) 478 479 i = 0 480 for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True): # each data is a dictionary 481 golden = np.array([i]) 482 np.testing.assert_array_equal(item["col0"], golden) 483 golden = np.array([i + 1]) 484 np.testing.assert_array_equal(item["col1"], golden) 485 i = i + 1 486 487 488def test_generator_17(): 489 """ 490 Test multi column generator Mp + 


def test_generator_18():
    """
    Test multiprocessing flag (same as test 13 with python_multiprocessing=True flag)
    """
    logger.info("Test map column order when input_columns is None.")

    # Reduce shm usage by disabling this optimization
    mem_original = ds.config.get_enable_shared_mem()
    ds.config.set_enable_shared_mem(False)

    # apply dataset operations
    data1 = ds.GeneratorDataset(generator_mc(2048), ["col0", "col1"], python_multiprocessing=True)
    data1 = data1.map(operations=(lambda x: (x * 5)), output_columns=["out0"], num_parallel_workers=2,
                      python_multiprocessing=True)

    # Expected column order is |out0|col1|
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        assert len(item) == 2
        golden = np.array([i * 5])
        np.testing.assert_array_equal(item[0], golden)
        golden = np.array([[i, i + 1], [i + 2, i + 3]])
        np.testing.assert_array_equal(item[1], golden)
        i = i + 1

    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        # len should be 2 because col0 has been renamed to out0 by the map above
        assert len(item) == 2
        golden = np.array([i * 5])
        np.testing.assert_array_equal(item["out0"], golden)

    ds.config.set_enable_shared_mem(mem_original)


def test_generator_19():
    """
    Test multiprocessing flag with 2 different large columns
    """
    logger.info("Test multiprocessing flag with 2 different large columns.")

    # apply dataset operations
    data1 = ds.GeneratorDataset(DatasetGeneratorLarge(), ["col0", "col1"], python_multiprocessing=True, shuffle=False)

    # Expected column order is |col0|col1|
    i = 0
    for item in data1.create_tuple_iterator(num_epochs=1, output_numpy=True):
        assert len(item) == 2
        golden = np.array(range(4000)) + i
        np.testing.assert_array_equal(item[0], golden)
        golden = np.array(range(4000)) * 10
        np.testing.assert_array_equal(item[1], golden)
        i = i + 1


class RandomAccessDataset:
    def __init__(self):
        self.__data = np.random.sample((5, 1))

    def __getitem__(self, item):
        return self.__data[item]

    def __len__(self):
        return 5


class RandomAccessDatasetWithoutLen:
    def __init__(self):
        self.__data = np.random.sample((5, 1))

    def __getitem__(self, item):
        return self.__data[item]


class IterableDataset:
    def __init__(self):
        self.count = 0
        self.max = 10

    def __iter__(self):
        return self

    def __next__(self):
        if self.count >= self.max:
            raise StopIteration
        self.count += 1
        return (np.array(self.count),)


def test_generator_20():
    """
    Test mappable and unmappable dataset as source for GeneratorDataset.
    """
    logger.info("Test mappable and unmappable dataset as source for GeneratorDataset.")

    # Mappable dataset
    data1 = ds.GeneratorDataset(RandomAccessDataset(), ["col0"])
    dataset_size1 = data1.get_dataset_size()
    assert dataset_size1 == 5

    # Mappable dataset without __len__
    data2 = ds.GeneratorDataset(RandomAccessDatasetWithoutLen(), ["col0"])
    try:
        data2.get_dataset_size()
    except RuntimeError as e:
        assert "'__len__' method is required" in str(e)

    # Unmappable dataset
    data3 = ds.GeneratorDataset(IterableDataset(), ["col0"])
    dataset_size3 = data3.get_dataset_size()
    assert dataset_size3 == 10
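

# The three source flavours exercised by test_generator_20:
#   * random-accessible with __len__     -> size known up front (5 rows)
#   * random-accessible without __len__  -> get_dataset_size() raises RuntimeError
#   * iterable (__iter__/__next__)       -> size found by exhausting the source (10 rows)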
598 """ 599 logger.info("Test mappable and unmappable dataset as source for GeneratorDataset.") 600 601 # Mappable dataset 602 data1 = ds.GeneratorDataset(RandomAccessDataset(), ["col0"]) 603 dataset_size1 = data1.get_dataset_size() 604 assert dataset_size1 == 5 605 606 # Mappable dataset without __len__ 607 data2 = ds.GeneratorDataset(RandomAccessDatasetWithoutLen(), ["col0"]) 608 try: 609 data2.get_dataset_size() 610 except RuntimeError as e: 611 assert "'__len__' method is required" in str(e) 612 613 # Unmappable dataset 614 data3 = ds.GeneratorDataset(IterableDataset(), ["col0"]) 615 dataset_size3 = data3.get_dataset_size() 616 assert dataset_size3 == 10 617 618 619def test_generator_error_1(): 620 def generator_np(): 621 for i in range(64): 622 yield (np.array([{i}]),) 623 624 with pytest.raises(RuntimeError) as info: 625 data1 = ds.GeneratorDataset(generator_np, ["data"]) 626 for _ in data1: 627 pass 628 assert "Invalid data type" in str(info.value) 629 630 631def test_generator_error_2(): 632 def generator_np(): 633 for i in range(64): 634 yield ({i},) 635 636 with pytest.raises(RuntimeError) as info: 637 data1 = ds.GeneratorDataset(generator_np, ["data"]) 638 for _ in data1: 639 pass 640 print("========", str(info.value)) 641 assert "Generator should return a tuple of NumPy arrays" in str(info.value) 642 643 644def test_generator_error_3(): 645 with pytest.raises(ValueError) as info: 646 # apply dataset operations 647 data1 = ds.GeneratorDataset(generator_mc(2048), ["label", "image"]) 648 data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns=["label"], output_columns=["out1", "out2"], 649 num_parallel_workers=2) 650 651 for _ in data1: 652 pass 653 assert "When length of input_columns and output_columns are not equal, column_order must be specified." in \ 654 str(info.value) 655 656 657def test_generator_error_4(): 658 with pytest.raises(RuntimeError) as info: 659 # apply dataset operations 660 data1 = ds.GeneratorDataset(generator_mc(2048), ["label", "image"]) 661 data1 = data1.map(operations=(lambda x: (x, x * 5)), input_columns=["label"], 662 num_parallel_workers=2) 663 664 for _ in data1: 665 pass 666 assert "Unexpected error. 


def test_generator_sequential_sampler():
    source = [(np.array([x]),) for x in range(64)]
    ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler())
    i = 0
    for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([i])
        np.testing.assert_array_equal(data["data"], golden)
        i = i + 1


def test_generator_random_sampler():
    source = [(np.array([x]),) for x in range(64)]
    ds1 = ds.GeneratorDataset(source, ["data"], shuffle=True)
    for _ in ds1.create_dict_iterator(num_epochs=1):  # each data is a dictionary
        pass


def test_generator_distributed_sampler():
    source = [(np.array([x]),) for x in range(64)]
    for sid in range(8):
        ds1 = ds.GeneratorDataset(source, ["data"], shuffle=False, num_shards=8, shard_id=sid)
        i = sid
        for data in ds1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
            golden = np.array([i])
            np.testing.assert_array_equal(data["data"], golden)
            i = i + 8


def test_generator_num_samples():
    source = [(np.array([x]),) for x in range(64)]
    num_samples = 32
    ds1 = ds.GeneratorDataset(source, ["data"], sampler=ds.SequentialSampler(num_samples=num_samples))
    ds2 = ds.GeneratorDataset(source, ["data"], sampler=[i for i in range(32)], num_samples=num_samples)
    ds3 = ds.GeneratorDataset(generator_1d, ["data"], num_samples=num_samples)

    count = 0
    for _ in ds1.create_dict_iterator(num_epochs=1):
        count = count + 1
    assert count == num_samples

    count = 0
    for _ in ds2.create_dict_iterator(num_epochs=1):
        count = count + 1
    assert count == num_samples

    count = 0
    for _ in ds3.create_dict_iterator(num_epochs=1):
        count = count + 1
    assert count == num_samples


def test_generator_num_samples_underflow():
    source = [(np.array([x]),) for x in range(64)]
    num_samples = 256
    ds2 = ds.GeneratorDataset(source, ["data"], sampler=[i for i in range(64)], num_samples=num_samples)
    ds3 = ds.GeneratorDataset(generator_1d, ["data"], num_samples=num_samples)

    count = 0
    for _ in ds2.create_dict_iterator(num_epochs=1):
        count = count + 1
    assert count == 64

    count = 0
    for _ in ds3.create_dict_iterator(num_epochs=1):
        count = count + 1
    assert count == 64


def type_tester_with_type_check_2c_schema(t, c):
    logger.info("Test with Type {}".format(t.__name__))

    schema = ds.Schema()
    schema.add_column("data0", c[0])
    schema.add_column("data1", c[1])

    # apply dataset operations
    data1 = ds.GeneratorDataset((lambda: generator_with_type_2c(t)), schema=schema)

    data1 = data1.batch(4)

    i = 0
    for item in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        golden = np.array([[i], [i + 1], [i + 2], [i + 3]], dtype=t)
        np.testing.assert_array_equal(item["data0"], golden)
        i = i + 4


def test_generator_schema():
    """
    Test 2 column Generator on different data type with type check with schema input
    """
    logger.info("Test 2 column Generator on all data types with type check")

    np_types = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16, np.uint32, np.uint64, np.float32,
                np.float64]
    de_types = [mstype.int8, mstype.int16, mstype.int32, mstype.int64, mstype.uint8, mstype.uint16, mstype.uint32,
                mstype.uint64, mstype.float32, mstype.float64]

    for i, _ in enumerate(np_types):
        type_tester_with_type_check_2c_schema(np_types[i], [de_types[i], de_types[i]])
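

# Not part of the original suite: a minimal sketch (assuming the same ds.Schema
# API used above) showing that add_column can also declare a column shape.
# The helper name is illustrative only.
def example_schema_with_shape():
    schema = ds.Schema()
    schema.add_column("data", mstype.int64, [1])  # one int64 value per row
    return ds.GeneratorDataset(generator_1d, schema=schema)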


def test_generator_dataset_size_0():
    """
    Test GeneratorDataset get_dataset_size by iterator method.
    """
    logger.info("Test 1D Generator : 0 - 63 get_dataset_size")

    data1 = ds.GeneratorDataset(generator_1d, ["data"])
    data_size = data1.get_dataset_size()

    num_rows = 0
    for _ in data1.create_dict_iterator(num_epochs=1, output_numpy=True):  # each data is a dictionary
        num_rows = num_rows + 1
    assert data_size == num_rows


def test_generator_dataset_size_1():
    """
    Test GeneratorDataset get_dataset_size by __len__ method.
    """
    logger.info("Test DatasetGenerator get_dataset_size")

    dataset_generator = DatasetGenerator()
    data1 = ds.GeneratorDataset(dataset_generator, ["data"])

    data_size = data1.get_dataset_size()

    num_rows = 0
    for _ in data1.create_dict_iterator(num_epochs=1):
        num_rows = num_rows + 1
    assert data_size == num_rows


def test_generator_dataset_size_2():
    """
    Test GeneratorDataset + repeat get_dataset_size
    """
    logger.info("Test 1D Generator + repeat get_dataset_size")

    data1 = ds.GeneratorDataset(generator_1d, ["data"])
    data1 = data1.repeat(2)

    data_size = data1.get_dataset_size()

    num_rows = 0
    for _ in data1.create_dict_iterator(num_epochs=1):
        num_rows = num_rows + 1
    assert data_size == num_rows


def test_generator_dataset_size_3():
    """
    Test GeneratorDataset + batch get_dataset_size
    """
    logger.info("Test 1D Generator + batch get_dataset_size")

    data1 = ds.GeneratorDataset(generator_1d, ["data"])
    data1 = data1.batch(4)

    data_size = data1.get_dataset_size()

    num_rows = 0
    for _ in data1.create_dict_iterator(num_epochs=1):
        num_rows += 1
    assert data_size == num_rows


def test_generator_dataset_size_4():
    """
    Test GeneratorDataset + num_shards
    """
    logger.info("Test 1D Generator : 0 - 63 + num_shards get_dataset_size")

    dataset_generator = DatasetGenerator()
    data1 = ds.GeneratorDataset(dataset_generator, ["data"], num_shards=3, shard_id=0)
    data_size = data1.get_dataset_size()

    num_rows = 0
    for _ in data1.create_dict_iterator(num_epochs=1):  # each data is a dictionary
        num_rows = num_rows + 1
    assert data_size == num_rows


def test_generator_dataset_size_5():
    """
    Test get_dataset_size after create_dict_iterator
    """
    logger.info("Test get_dataset_size after create_dict_iterator")

    dataset_generator = DatasetGenerator()
    data1 = ds.GeneratorDataset(dataset_generator, ["data"], num_shards=3, shard_id=0)

    num_rows = 0
    for _ in data1.create_dict_iterator(num_epochs=1):  # each data is a dictionary
        num_rows = num_rows + 1
    data_size = data1.get_dataset_size()
    assert data_size == num_rows


def manual_test_generator_keyboard_interrupt():
    """
    Test keyboard_interrupt (run manually: every row blocks forever in __getitem__)
    """
    logger.info("Test 1D Generator MP : 0 - 63")

    class MyDS():
        def __getitem__(self, item):
            while True:
                pass

        def __len__(self):
            return 1024

    ds1 = ds.GeneratorDataset(MyDS(), ["data"], num_parallel_workers=4).repeat(2)
    for _ in ds1.create_dict_iterator(num_epochs=1):  # each data is a dictionary
        pass


def test_explicit_deepcopy():
    """
    Test explicit deepcopy of a dataset object
    """
    logger.info("Test explicit_deepcopy")

    ds1 = ds.NumpySlicesDataset([1, 2], shuffle=False)
    ds2 = copy.deepcopy(ds1)
    for d1, d2 in zip(ds1, ds2):
        assert d1 == d2


def test_func_generator_dataset_005():
    """
    generator: class source whose __getitem__ returns Tensors
    """
    result = [np.random.randn(242, 242, 242), np.random.randn(42, 24, 442)]

    class MyData():
        def __init__(self, input_para):
            self.data = input_para

        def __getitem__(self, item):
            return (Tensor(self.data[0]), Tensor(self.data[1]))

        def __len__(self):
            return 2

    column_names = ["col1", "col2"]
    dataset = ds.GeneratorDataset(MyData(result), column_names)
    i = 0
    for data in dataset.create_dict_iterator(output_numpy=True):
        assert "col1" in str(data.keys())
        assert (data["col1"] == result[0]).all()
        assert (data["col2"] == result[1]).all()
        i += 1
    assert i == 2


if __name__ == "__main__":
    test_generator_0()
    test_generator_1()
    test_generator_2()
    test_generator_3()
    test_generator_4()
    test_generator_5()
    test_generator_6()
    test_generator_7()
    test_generator_8()
    test_generator_9()
    test_generator_10()
    test_generator_11()
    test_generator_12()
    test_generator_13()
    test_generator_14()
    test_generator_15()
    test_generator_16()
    test_generator_17()
    test_generator_18()
    test_generator_19()
    test_generator_20()
    test_generator_error_1()
    test_generator_error_2()
    test_generator_error_3()
    test_generator_error_4()
    test_generator_sequential_sampler()
    test_generator_distributed_sampler()
    test_generator_random_sampler()
    test_generator_num_samples()
    test_generator_num_samples_underflow()
    test_generator_schema()
    test_generator_dataset_size_0()
    test_generator_dataset_size_1()
    test_generator_dataset_size_2()
    test_generator_dataset_size_3()
    test_generator_dataset_size_4()
    test_generator_dataset_size_5()
    test_explicit_deepcopy()
    test_func_generator_dataset_005()