/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "common/common.h"
#include "minddata/dataset/core/global_context.h"
#include "minddata/dataset/engine/serdes.h"
#include "minddata/dataset/include/dataset/datasets.h"
#include "minddata/dataset/include/dataset/vision.h"
#include "minddata/dataset/include/dataset/transforms.h"
#include "minddata/dataset/kernels/ir/data/transforms_ir.h"

using namespace mindspore::dataset;
using mindspore::dataset::DatasetNode;
using mindspore::dataset::ShuffleMode;
using mindspore::dataset::Tensor;

// NOTE(review): in this copy of the file the template argument lists (for example on
// std::shared_ptr, std::vector and std::make_shared) are absent. They are reproduced exactly
// as found; confirm them against the upstream file before building.

// Test fixture shared by every serialize/deserialize round-trip test in this file.
class MindDataTestDeserialize : public UT::DatasetOpTesting {
 protected:
};

// Round-trip check used by the tests below.
//
// Steps:
//   1. Serialize `ds` into `out_json` and the file "dataset_pipeline.json".
//   2. Deserialize that file into `ds1`.
//   3. Serialize `ds1` into `out_json1` and the file "dataset_pipeline_1.json".
//   4. Expect the text dumps of both JSON objects to be identical.
//
// @param ds  Root of the dataset pipeline to round-trip.
void compare_dataset(std::shared_ptr ds) {
  nlohmann::json out_json;
  ASSERT_OK(Serdes::SaveToJSON(ds, "dataset_pipeline.json", &out_json));

  // output the deserialized out_json to ds1 and then out_json1
  std::shared_ptr ds1;
  ASSERT_OK(Serdes::Deserialize("dataset_pipeline.json", &ds1));
  // Fatal check: ds1 is handed to SaveToJSON right below, so do not continue with a null pointer.
  ASSERT_NE(ds1, nullptr);

  // check original and deserialized dataset are the same
  nlohmann::json out_json1;
  ASSERT_OK(Serdes::SaveToJSON(ds1, "dataset_pipeline_1.json", &out_json1));

  std::stringstream json_ss;
  json_ss << out_json;
  std::stringstream json_ss1;
  json_ss1 << out_json1;
  EXPECT_EQ(json_ss.str(), json_ss1.str());
}

// test mnist dataset, and special cases of tensor operations (no input or tensor operation input)
TEST_F(MindDataTestDeserialize, TestDeserializeMnist) {
  MS_LOG(INFO) << "Doing MindDataTestDeserialize-Mnist.";
  std::string data_dir = "./data/dataset/testMnistData";
  std::string usage = "all";
  std::shared_ptr sampler = std::make_shared(true, 100);
  std::shared_ptr ds = std::make_shared(data_dir, usage, sampler, nullptr);

  // Operations without arguments, plus one (operation1) that wraps another operation.
  std::shared_ptr operation0 = std::make_shared();
  std::shared_ptr operation1 = std::make_shared(operation0, 0.5);
  std::shared_ptr operation2 = std::make_shared();
  std::shared_ptr operation3 = std::make_shared();
  std::shared_ptr operation4 = std::make_shared();
  std::shared_ptr operation5 = std::make_shared();
  std::shared_ptr operation6 = std::make_shared();

  // operation7 is built from a policy: a list of sub-policies of (operation, double) pairs.
  std::vector, double>>> policy;
  std::vector, double>> sub_policy;
  sub_policy.push_back(std::make_pair(operation1, 0.4));
  policy.push_back(sub_policy);
  std::shared_ptr operation7 = std::make_shared(policy);

  std::vector> transforms;
  transforms.push_back(operation2);
  transforms.push_back(operation3);
  transforms.push_back(operation4);
  // operation8 receives the list as it stands here (operation2..operation4 only).
  std::shared_ptr operation8 = std::make_shared(transforms, 3);
  transforms.push_back(operation5);
  transforms.push_back(operation6);
  transforms.push_back(operation7);
  transforms.push_back(operation8);

  ds = std::make_shared(ds, transforms);
  ds = std::make_shared(ds, 10, true);
  compare_dataset(ds);
}

// test celeba dataset and part of the tensor operation
TEST_F(MindDataTestDeserialize, TestDeserializeCelebA) {
  MS_LOG(INFO) << "Doing MindDataTestDeserialize-CelebA.";
  std::string data_dir = "./data/dataset/testCelebAData/";
  std::string usage = "all";
  std::shared_ptr sampler = std::make_shared(1, 0, true, 2, 1, 1, true);
  bool decode = true;
  std::set extensions = {};
  std::shared_ptr cache = nullptr;
  std::shared_ptr ds = std::make_shared(data_dir, usage, sampler, decode, extensions, cache);

  // Arguments shared by the operations below.
  std::vector size = {80, 80};
  std::vector coordinates = {5, 5};
  std::vector padding = {20, 20, 20, 20};
  std::vector fill_value = {20, 20, 20};
  std::vector ignore = {20, 20, 20, 20};
  std::vector mean = {2.0, 2.0, 2.0, 2.0};
  std::vector std = {0.5, 0.5, 0.5, 0.5};
  std::vector translation = {0.5, 0.5};
  std::vector shear = {0.5, 0.5};
  std::vector sigma = {0.5, 0.5};
  InterpolationMode interpolation = InterpolationMode::kLinear;

  std::shared_ptr operation0 = std::make_shared(0.0, translation, 0.5, shear, interpolation, fill_value);
  std::shared_ptr operation1 = std::make_shared(0.5, ignore);
  std::shared_ptr operation2 = std::make_shared(size);
  std::shared_ptr operation3 = std::make_shared(ImageBatchFormat::kNHWC, 0.1, 0.1);
  std::shared_ptr operation4 = std::make_shared(1, 1);
  std::shared_ptr operation5 = std::make_shared(true);
  std::shared_ptr operation6 = std::make_shared(coordinates, sigma);
  std::shared_ptr operation7 = std::make_shared(1.0);
  std::shared_ptr operation8 = std::make_shared(mean, std);
  std::shared_ptr operation9 = std::make_shared(mean, std, "float");
  std::shared_ptr operation10 = std::make_shared(padding, fill_value, BorderType::kConstant);
  std::shared_ptr operation11 = std::make_shared(1.0, 0.5);
  std::shared_ptr operation12 = std::make_shared(10, 10, 0);
  std::shared_ptr operation13 = std::make_shared(size, interpolation);
  std::shared_ptr operation14 = std::make_shared(size, interpolation);

  std::vector> operations;
  operations.push_back(operation0);
  operations.push_back(operation1);
  operations.push_back(operation2);
  operations.push_back(operation3);
  operations.push_back(operation4);
  operations.push_back(operation5);
  operations.push_back(operation6);
  operations.push_back(operation7);
  operations.push_back(operation8);
  operations.push_back(operation9);
  operations.push_back(operation10);
  operations.push_back(operation11);
  operations.push_back(operation12);
  operations.push_back(operation13);
  operations.push_back(operation14);

  ds = std::make_shared(ds, 2);
  ds = std::make_shared(ds, operations);
  compare_dataset(ds);
}

// test cifar10 dataset and random tensor operations
TEST_F(MindDataTestDeserialize, TestDeserializeCifar10) {
  MS_LOG(INFO) << "Doing MindDataTestDeserialize-Cifar10.";
  std::string data_dir = "./data/dataset/testCifar10Data";
  std::string usage = "all";
  std::shared_ptr cache = nullptr;
  std::shared_ptr sampler = std::make_shared(0, 10);
  std::shared_ptr ds = std::make_shared(data_dir, usage, sampler, cache);

  // Arguments shared by the operations below.
  std::vector center = {50.0, 50.0};
  std::vector threshold = {5, 5};
  std::vector fill_value = {150, 150, 150};
  std::vector bit_range = {5, 15};
  std::vector degrees = {0.0, 0.0};
  std::vector scale = {0.5, 0.5};
  std::vector ratio = {0.5, 0.5};
  std::vector size = {224, 224};
  std::vector padding = {20, 20, 20, 20};
  std::vector translate_range = {0.0, 0.0, 0.0, 0.0};
  std::vector scale_range = {1.0, 1.0};
  std::vector shear_ranges = {0.0, 0.0, 0.0, 0.0};
  InterpolationMode interpolation = InterpolationMode::kLinear;

  std::shared_ptr operation1 = std::make_shared(
    degrees, InterpolationMode::kNearestNeighbour, true, center, fill_value);
  std::shared_ptr operation2 = std::make_shared(
    degrees, translate_range, scale_range, shear_ranges, interpolation, fill_value);
  std::shared_ptr operation3 = std::make_shared(0.5, 10.5);
  std::shared_ptr operation4 = std::make_shared(size, scale, ratio, interpolation, 2);
  std::shared_ptr operation5 = std::make_shared(size, padding, true, fill_value, BorderType::kConstant);
  std::shared_ptr operation6 = std::make_shared(0.1);
  std::shared_ptr operation7 = std::make_shared(0.1);
  std::shared_ptr operation8 = std::make_shared(bit_range);
  std::shared_ptr operation9 = std::make_shared(size);
  std::shared_ptr operation10 = std::make_shared(size);
  std::shared_ptr operation11 = std::make_shared(size, scale, ratio, interpolation, 2);
  std::shared_ptr operation12 = std::make_shared(size, scale, ratio, interpolation, 2);
  std::shared_ptr operation13 = std::make_shared(degrees, interpolation, true, center, fill_value);
  std::shared_ptr operation14 = std::make_shared(degrees);
  std::shared_ptr operation15 = std::make_shared(threshold);
  std::shared_ptr operation16 = std::make_shared(0.1);
  std::shared_ptr operation17 = std::make_shared(0.1);

  std::vector> operations;
  operations.push_back(operation1);
  operations.push_back(operation2);
  operations.push_back(operation3);
  operations.push_back(operation4);
  operations.push_back(operation5);
  operations.push_back(operation6);
  operations.push_back(operation7);
  operations.push_back(operation8);
  operations.push_back(operation9);
  operations.push_back(operation10);
  operations.push_back(operation11);
  operations.push_back(operation12);
  operations.push_back(operation13);
  operations.push_back(operation14);
  operations.push_back(operation15);
  operations.push_back(operation16);
  operations.push_back(operation17);

  ds = std::make_shared(ds, operations);
  ds = std::make_shared(ds, 1, true);
  ds = std::make_shared(ds, 1);
  compare_dataset(ds);
}

// Round-trips a pipeline rooted at ./data/dataset/testCifar100Data through compare_dataset.
TEST_F(MindDataTestDeserialize, TestDeserializeCifar100) {
  MS_LOG(INFO) << "Doing MindDataTestDeserialize-Cifar100.";
  std::string data_dir = "./data/dataset/testCifar100Data";
  std::string usage = "all";
  std::shared_ptr cache = nullptr;
  std::shared_ptr sampler = std::make_shared(0, 10);
  std::shared_ptr ds = std::make_shared(data_dir, usage, sampler, cache);
  ds = std::make_shared(ds, 6);

  // First stage: a single argument-less operation.
  std::shared_ptr operation = std::make_shared();
  std::vector> ops = {operation};
  ds = std::make_shared(ds, ops);

  // Second stage: four operations; note that `size` is reassigned between operation1 and operation2.
  std::vector> operations;
  std::vector size = {32, 32};
  std::vector padding = {4, 4, 4, 4};
  bool pad_if_needed = false;
  std::vector fill_value = {4, 4, 4};
  InterpolationMode interpolation = InterpolationMode::kLinear;
  std::shared_ptr operation1 = std::make_shared(size, padding, pad_if_needed, fill_value, BorderType::kConstant);
  size = {224, 224};
  std::shared_ptr operation2 = std::make_shared(size, interpolation);
  std::shared_ptr operation3 = std::make_shared(0.5, 0.0);
  std::vector mean = {0.49, 0.48, 0.46};
  std::vector std = {0.20, 0.199, 0.201};
  std::shared_ptr operation4 = std::make_shared(mean, std);
  operations.push_back(operation1);
  operations.push_back(operation2);
  operations.push_back(operation3);
  operations.push_back(operation4);

  ds = std::make_shared(ds, operations);
  ds = std::make_shared(ds, 3, true);
  ds = std::make_shared(ds, 1);
  compare_dataset(ds);
}

// Round-trips a pipeline reading ./data/dataset/testCSV/1.csv, followed by a node taking a column list.
TEST_F(MindDataTestDeserialize, TestDeserializeCSV) {
  MS_LOG(INFO) << "Doing MindDataTestDeserialize-CSV.";
  std::string data_file = "./data/dataset/testCSV/1.csv";
  std::vector dataset_files = {data_file};
  char field_delim = ',';
  std::vector column_names = {"col1", "col2", "col3", "col4"};
  std::vector columns = {"col1", "col4", "col2"};
  std::vector> column_defaults = {};
  std::shared_ptr cache = nullptr;
  std::shared_ptr ds = std::make_shared(dataset_files, field_delim, column_defaults, column_names, 3,
                                        ShuffleMode::kGlobal, 1, 0, cache);
  ds = std::make_shared(ds, columns);
  compare_dataset(ds);
}

// Round-trips a pipeline rooted at ./data/dataset/testPK/data whose sampler has a child sampler attached.
TEST_F(MindDataTestDeserialize, TestDeserializeImageFolder) {
  MS_LOG(INFO) << "Doing MindDataTestDeserialize-ImageFolder.";
  std::string dataset_dir = "./data/dataset/testPK/data";
  std::shared_ptr child_sampler = std::make_shared(3, true, 1);
  std::vector weights = {1.0, 0.1, 0.02, 0.3, 0.4, 0.05, 1.2, 0.13, 0.14, 0.015, 0.16, 1.1};
  std::set extensions = {};
  std::shared_ptr cache = nullptr;
  std::map class_indexing = {};
  std::shared_ptr sampler = std::make_shared(weights, 11);
  sampler->AddChildSampler(child_sampler);
  std::shared_ptr ds = std::make_shared(dataset_dir, false, sampler, false, extensions, class_indexing, cache);
  ds = std::make_shared(ds, 1);

  std::vector size = {224, 224};
  std::vector scale = {0.5, 0.5};
  std::vector ratio = {0.5, 0.5};
  std::vector center = {50.0, 50.0};
  std::vector fill_value = {150, 150, 150};
  InterpolationMode interpolation = InterpolationMode::kLinear;

  // First stage: one operation built from `size`.
  std::shared_ptr operation1 = std::make_shared(size);
  std::vector> ops = {operation1};
  ds = std::make_shared(ds, ops);

  // Second stage: two further operations.
  std::vector> operations;
  std::shared_ptr operation2 = std::make_shared(size, scale, ratio, 2);
  std::shared_ptr operation3 = std::make_shared(0.5, interpolation, true, center, fill_value);
  operations.push_back(operation2);
  operations.push_back(operation3);
  ds = std::make_shared(ds, operations);
  ds = std::make_shared(ds, 2, true);
  compare_dataset(ds);
}

// Round-trips a pipeline reading ./data/dataset/testManifestData/cpp.json with usage "train".
TEST_F(MindDataTestDeserialize, TestDeserializeManifest) {
  MS_LOG(INFO) << "Doing MindDataTestDeserialize-Manifest.";
  std::string data_file = "./data/dataset/testManifestData/cpp.json";
  std::shared_ptr sampler = std::make_shared(0, 10);
  std::map class_indexing = {};
  std::shared_ptr cache = nullptr;
  std::shared_ptr ds = std::make_shared(data_file, "train", sampler, class_indexing, false, cache);

  std::vector coordinates = {50, 50};
  std::vector size = {224, 224};
  std::shared_ptr operation1 = std::make_shared(coordinates, size);
  std::shared_ptr operation2 = std::make_shared();
  std::shared_ptr operation3 = std::make_shared();
  std::shared_ptr operation4 = std::make_shared(5, 5, SliceMode::kDrop, 1);
  std::shared_ptr operation5 = std::make_shared();

  std::vector> operations;
  operations.push_back(operation1);
  operations.push_back(operation2);
  operations.push_back(operation3);
  operations.push_back(operation4);
  operations.push_back(operation5);
  ds = std::make_shared(ds, operations);
  ds = std::make_shared(ds, 2, false);
  compare_dataset(ds);
}

// Round-trips a pipeline rooted at ./data/dataset/testVOC2012 (task "Detection", usage "train").
TEST_F(MindDataTestDeserialize, TestDeserializeVOC) {
  MS_LOG(INFO) << "Doing MindDataTestDeserialize-VOC.";
  std::string dataset_dir = "./data/dataset/testVOC2012";
  std::vector indices = {0, 1};
  std::shared_ptr sampler = std::make_shared(indices, 3);
  std::string task = "Detection";
  std::string usage = "train";
  std::map class_indexing = {};
  std::shared_ptr cache = nullptr;
  std::shared_ptr ds = std::make_shared(dataset_dir, task, usage, class_indexing, true, sampler, cache);

  std::vector brightness = {0.5, 0.5};
  std::vector contrast = {1.0, 1.0};
  std::vector hue = {0.0, 0.0};
  std::vector saturation = {1.0, 1.0};
  // Argument order is brightness, contrast, saturation, hue (differs from the declaration order above).
  std::shared_ptr operation = std::make_shared(brightness, contrast, saturation, hue);
  std::vector> ops = {operation};
  ds = std::make_shared(ds, ops);
  ds = std::make_shared(ds, 2);
  compare_dataset(ds);
}
TEST_F(MindDataTestDeserialize, TestDeserializeCLUE) { MS_LOG(INFO) << "Doing MindDataTestDeserialize-CLUE."; std::string train_file = "./data/dataset/testCLUE/afqmc/train.json"; std::string task = "AFQMC"; std::string usage = "train"; std::vector files = {train_file}; std::shared_ptr cache = nullptr; std::shared_ptr ds = std::make_shared(files, task, usage, 1, ShuffleMode::kFalse, 1, 0, cache); ds = std::make_shared(ds, 1); std::shared_ptr operation1 = std::make_shared(true); std::vector> ops = {operation1}; ds = std::make_shared(ds, ops); compare_dataset(ds); } TEST_F(MindDataTestDeserialize, TestDeserializeCoco) { MS_LOG(INFO) << "Doing MindDataTestDeserialize-Coco."; std::string folder_path = "./data/dataset/testCOCO/train"; std::string annotation_file = "./data/dataset/testCOCO/annotations/train.json"; std::string task = "Detection"; std::vector indices = {0, 1}; std::shared_ptr sampler = std::make_shared(indices, 3); std::shared_ptr cache = nullptr; std::shared_ptr ds = std::make_shared(folder_path, annotation_file, task, true, sampler, cache, false); std::vector fill_value = {150, 150, 150}; std::vector degrees = {0.0, 0.0}; std::vector scale = {0.5, 0.5}; std::vector ratio = {0.5, 0.5}; std::vector size = {224, 224}; std::vector padding = {20, 20, 20, 20}; InterpolationMode interpolation = InterpolationMode::kLinear; std::shared_ptr operation1 = std::make_shared(size, scale, ratio, interpolation, 2); std::shared_ptr operation2 = std::make_shared(size, padding, true, fill_value, BorderType::kConstant); std::shared_ptr operation3 = std::make_shared(0.1); std::shared_ptr operation4 = std::make_shared(0.1); std::vector> operations; operations.push_back(operation1); operations.push_back(operation2); operations.push_back(operation3); operations.push_back(operation4); ds = std::make_shared(ds, operations); compare_dataset(ds); } TEST_F(MindDataTestDeserialize, TestDeserializeTFRecord) { MS_LOG(INFO) << "Doing MindDataTestDeserialize-TFRecord."; int num_samples = 
12; int32_t num_shards = 1; int32_t shard_id = 0; bool shard_equal_rows = false; std::shared_ptr cache = nullptr; std::vector columns_list = {}; std::vector dataset_files = {"./data/dataset/testTFTestAllTypes/test.data"}; std::shared_ptr schema = Schema(); ASSERT_OK(schema->add_column("col1", mindspore::DataType::kNumberTypeInt32, {4})); ASSERT_OK(schema->add_column("col2", mindspore::DataType::kNumberTypeInt64, {4})); std::shared_ptr ds = std::make_shared(dataset_files, schema, columns_list, num_samples, ShuffleMode::kFiles, num_shards, shard_id, shard_equal_rows, cache); ds = std::make_shared(ds, 10000, true); std::vector input_columns = {"col_sint16", "col_sint32", "col_sint64", "col_float", "col_1d", "col_2d", "col_3d", "col_binary"}; std::vector output_columns = {"column_sint16", "column_sint32", "column_sint64", "column_float", "column_1d", "column_2d", "column_3d", "column_binary"}; std::shared_ptr operation = std::make_shared(); std::vector> ops = {operation}; ds = std::make_shared(ds, ops, input_columns, output_columns); std::string train_file = "./data/dataset/testCLUE/afqmc/train.json"; std::string task1 = "AFQMC"; std::string usage = "train"; std::vector files = {train_file}; std::shared_ptr ds_child1 = std::make_shared(files, task1, usage, 0, ShuffleMode::kFalse, 1, 0, cache); std::vector dataset_files2 = {"./data/dataset/testTextFileDataset/1.txt"}; std::shared_ptr ds_child2 = std::make_shared(dataset_files2, 2, ShuffleMode::kFiles, 1, 0, cache); std::vector> datasets = {ds, ds_child1, ds_child2}; ds = std::make_shared(datasets); compare_dataset(ds); } TEST_F(MindDataTestDeserialize, TestDeserializeTextfile) { MS_LOG(INFO) << "Doing MindDataTestDeserialize-Textfile."; std::vector dataset_files = {"./data/dataset/testTextFileDataset/1.txt"}; std::shared_ptr cache = nullptr; std::shared_ptr ds = std::make_shared(dataset_files, 2, ShuffleMode::kFiles, 1, 0, cache); std::shared_ptr operation = std::make_shared(); std::vector> ops = {operation}; ds = 
std::make_shared(ds, ops); ds = std::make_shared(ds, 10, true); compare_dataset(ds); } TEST_F(MindDataTestDeserialize, TestDeserializeInvalidJson) { std::shared_ptr ds; // check the invalid json path would return error ASSERT_ERROR(Serdes::Deserialize("invalid_dataset.json", &ds)); // check the invalid json object would return error ASSERT_ERROR(Serdes::Deserialize("./data/dataset/testDataset1/datasetTestInvalidJson.json", &ds)); EXPECT_EQ(ds, nullptr); } TEST_F(MindDataTestDeserialize, TestDeserializeFill) { MS_LOG(INFO) << "Doing MindDataTestDeserialize-Fill."; std::vector dataset_files = {"./data/dataset/testTextFileDataset/1.txt"}; std::shared_ptr cache = nullptr; std::shared_ptr ds = std::make_shared(dataset_files, 2, ShuffleMode::kFiles, 1, 0, cache); std::shared_ptr fill_value; ASSERT_OK(Tensor::CreateScalar(true, &fill_value)); std::shared_ptr operation1 = std::make_shared(fill_value); std::shared_ptr operation2 = std::make_shared("int32_t"); std::vector> ops = {operation1, operation2}; ds = std::make_shared(ds, ops); ds = std::make_shared(ds, "queue", "type", 1, true, 10, true); compare_dataset(ds); } TEST_F(MindDataTestDeserialize, TestDeserializeTensor) { MS_LOG(INFO) << "Doing MindDataTestDeserialize-Tensor."; std::shared_ptr test_tensor; std::vector input = {1.1, 0.2, 0.3, 0.4, 0.5, 0.6, 1.2, 0.7, 0.8, 0.9, 1.0, 2.0, 1.3, 3.0, 4.0}; ASSERT_OK(Tensor::CreateFromVector(input, TensorShape{3, 5}, &test_tensor)); nlohmann::json json_obj; ASSERT_OK(test_tensor->to_json(&json_obj)); std::shared_ptr test_tensor1; ASSERT_OK(Tensor::from_json(json_obj, &test_tensor1)); nlohmann::json json_obj1; ASSERT_OK(test_tensor1->to_json(&json_obj1)); std::stringstream json_ss; json_ss << json_obj; std::stringstream json_ss1; json_ss1 << json_obj1; EXPECT_EQ(json_ss.str(), json_ss1.str()); } // Helper function to get the session id from SESSION_ID env variable Status GetSessionFromEnv(session_id_type *session_id); TEST_F(MindDataTestDeserialize, 
DISABLED_TestDeserializeCache) { MS_LOG(INFO) << "Doing MindDataTestDeserialize-Cache."; std::string data_dir = "./data/dataset/testCache"; std::string usage = "all"; session_id_type env_session; ASSERT_TRUE(GetSessionFromEnv(&env_session)); std::shared_ptr some_cache = CreateDatasetCache(env_session, 0, false, "127.0.0.1", 50052, 1, 1); std::shared_ptr sampler = std::make_shared(0, 10); std::shared_ptr ds = std::make_shared(data_dir, usage, sampler, some_cache); compare_dataset(ds); } TEST_F(MindDataTestDeserialize, TestDeserializeConcatAlbumFlickr) { MS_LOG(INFO) << "Doing MindDataTestDeserialize-ConcatAlbumFlickr."; std::string dataset_dir = "./data/dataset/testAlbum"; std::vector column_names = {"col1", "col2", "col3"}; bool decode = false; std::shared_ptr sampler = std::make_shared(0, 10); std::string data_schema = "./data/dataset/testAlbum/datasetSchema.json"; std::shared_ptr ds = std::make_shared(dataset_dir, data_schema, column_names, decode, sampler, nullptr); std::shared_ptr operation = std::make_shared(0.5, 0.5); std::vector> ops = {operation}; ds = std::make_shared(ds, ops); std::string dataset_path = "./data/dataset/testFlickrData/flickr30k/flickr30k-images"; std::string annotation_file = "./data/dataset/testFlickrData/flickr30k/test1.token"; std::shared_ptr ds_child1 = std::make_shared(dataset_path, annotation_file, decode, sampler, nullptr); std::vector> datasets = {ds, ds_child1}; std::pair pair = std::make_pair(1, 1); std::vector> children_flag_and_nums = {pair}; std::vector> children_start_end_index = {pair}; ds = std::make_shared(datasets, sampler, children_flag_and_nums, children_start_end_index); compare_dataset(ds); } TEST_F(MindDataTestDeserialize, TestDeserializePyFunc) { MS_LOG(INFO) << "Doing MindDataTestDeserialize-PyFunc."; std::shared_ptr ds1; ASSERT_OK(Serdes::Deserialize("./data/dataset/tf_file_dataset/pyvision_dataset_pipeline.json", &ds1)); EXPECT_NE(ds1, nullptr); }