# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import time
import numpy as np

import mindspore.dataset as ds


# This UT covers the following cases:
# 1. padding: input_shape=[x], output_shape=[y] where y > x
# 2. padding in one dimension and truncating in the other: input_shape=[x1,x2], output_shape=[y1,y2] where y1 > x1 and y2 < x2
# 3. automatic padding for a specific column
# 4. default setting for all columns
# 5. None in different places of pad_info

# this generator function yields two columns
# col1d: [0], [1], [2], [3]
# col2d: [[100],[200]], [[101],[201]], [[102],[202]], [[103],[203]]
def gen_2cols(num):
    for i in range(num):
        yield (np.array([i]), np.array([[i + 100], [i + 200]]))


# this generator function yields one column of variable shapes
# col: [0], [0,1], [0,1,2], [0,1,2,3]
def gen_var_col(num):
    for i in range(num):
        yield (np.array([j for j in range(i + 1)]),)


# this generator function yields two columns of variable shapes
# col1: [0], [0,1], [0,1,2], [0,1,2,3]
# col2: [100], [100,101], [100,101,102], [100,101,102,103]
def gen_var_cols(num):
    for i in range(num):
        yield (np.array([j for j in range(i + 1)]), np.array([100 + j for j in range(i + 1)]))


# this generator function yields two columns of variable shapes
# col1: [[0]], [[0,1]], [[0,1,2]], [[0,1,2,3]]
# col2: [[100]], [[100,101]], [[100,101,102]], [[100,101,102,103]]
def gen_var_cols_2d(num):
    for i in range(num):
        yield (np.array([[j for j in range(i + 1)]]), np.array([[100 + j for j in range(i + 1)]]))


def test_batch_padding_01():
    data1 = ds.GeneratorDataset((lambda: gen_2cols(2)), ["col1d", "col2d"])
    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col2d": ([2, 2], -2), "col1d": ([2], -1)})
    data1 = data1.repeat(2)
    for data in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
        np.testing.assert_array_equal([[0, -1], [1, -1]], data["col1d"])
        np.testing.assert_array_equal([[[100, -2], [200, -2]], [[101, -2], [201, -2]]], data["col2d"])


def test_batch_padding_02():
    data1 = ds.GeneratorDataset((lambda: gen_2cols(2)), ["col1d", "col2d"])
    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col2d": ([1, 2], -2)})
    data1 = data1.repeat(2)
    for data in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
        np.testing.assert_array_equal([[0], [1]], data["col1d"])
        np.testing.assert_array_equal([[[100, -2]], [[101, -2]]], data["col2d"])


def test_batch_padding_03():
    data1 = ds.GeneratorDataset((lambda: gen_var_col(4)), ["col"])
    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col": (None, -1)})  # pad automatically
    data1 = data1.repeat(2)
    res = dict()
    for ind, data in enumerate(data1.create_dict_iterator(num_epochs=1, output_numpy=True)):
        res[ind] = data["col"].copy()
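
    # with pad_shape=None each batch is padded to its own longest row, so the
    # batches from the second repeat match the first two batch-for-batch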
    np.testing.assert_array_equal(res[0], [[0, -1], [0, 1]])
    np.testing.assert_array_equal(res[1], [[0, 1, 2, -1], [0, 1, 2, 3]])
    np.testing.assert_array_equal(res[2], [[0, -1], [0, 1]])
    np.testing.assert_array_equal(res[3], [[0, 1, 2, -1], [0, 1, 2, 3]])


def test_batch_padding_04():
    data1 = ds.GeneratorDataset((lambda: gen_var_cols(2)), ["col1", "col2"])
    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={})  # pad automatically
    data1 = data1.repeat(2)
    for data in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
        np.testing.assert_array_equal(data["col1"], [[0, 0], [0, 1]])
        np.testing.assert_array_equal(data["col2"], [[100, 0], [100, 101]])


def test_batch_padding_05():
    data1 = ds.GeneratorDataset((lambda: gen_var_cols_2d(3)), ["col1", "col2"])
    data1 = data1.batch(batch_size=3, drop_remainder=False,
                        pad_info={"col2": ([2, None], -2), "col1": (None, -1)})  # pad automatically
    for data in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
        np.testing.assert_array_equal(data["col1"], [[[0, -1, -1]], [[0, 1, -1]], [[0, 1, 2]]])
        np.testing.assert_array_equal(data["col2"], [[[100, -2, -2], [-2, -2, -2]], [[100, 101, -2], [-2, -2, -2]],
                                                     [[100, 101, 102], [-2, -2, -2]]])


def batch_padding_performance_3d():
    cifar10_dir = "../data/dataset/testCifar10Data"
    data1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False)  # shape = [32,32,3]
    data1 = data1.repeat(24)
    pad_info = {"image": ([36, 36, 3], 0)}
    # pad_info = None
    data1 = data1.batch(batch_size=24, drop_remainder=True, pad_info=pad_info)
    start_time = time.time()
    num_batches = 0
    for _ in data1.create_dict_iterator(num_epochs=1):
        num_batches += 1
    _ = "total number of batches: " + str(num_batches) + " time elapsed: " + str(time.time() - start_time)
    # print(res)


def batch_padding_performance_1d():
    cifar10_dir = "../data/dataset/testCifar10Data"
    data1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False)  # shape = [32,32,3]
    data1 = data1.repeat(24)
    data1 = data1.map(operations=(lambda x: x.reshape(-1)), input_columns="image")
    pad_info = {"image": ([3888], 0)}  # 3888 = 36*36*3
    # pad_info = None
    data1 = data1.batch(batch_size=24, drop_remainder=True, pad_info=pad_info)
    start_time = time.time()
    num_batches = 0
    for _ in data1.create_dict_iterator(num_epochs=1):
        num_batches += 1
    _ = "total number of batches: " + str(num_batches) + " time elapsed: " + str(time.time() - start_time)
    # print(res)


def batch_pyfunc_padding_3d():
    cifar10_dir = "../data/dataset/testCifar10Data"
    data1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False)  # shape = [32,32,3]
    data1 = data1.repeat(24)
    # pad_info = {"image": ([36, 36, 3], 0)}
    data1 = data1.map(operations=(lambda x: np.pad(x, ((0, 4), (0, 4), (0, 0)))), input_columns="image",
                      python_multiprocessing=False)
    data1 = data1.batch(batch_size=24, drop_remainder=True)
    start_time = time.time()
    num_batches = 0
    for _ in data1.create_dict_iterator(num_epochs=1):
        num_batches += 1
    _ = "total number of batches: " + str(num_batches) + " time elapsed: " + str(time.time() - start_time)
    # print(res)


def batch_pyfunc_padding_1d():
    cifar10_dir = "../data/dataset/testCifar10Data"
    data1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False)  # shape = [32,32,3]
    data1 = data1.repeat(24)
    data1 = data1.map(operations=(lambda x: x.reshape(-1)), input_columns="image")
    data1 = data1.map(operations=(lambda x: np.pad(x, (0, 816))), input_columns="image", python_multiprocessing=False)
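    # note: 816 = 36*36*3 - 32*32*3, i.e. np.pad grows the flattened image from
    # 3072 to 3888 elements, the same target shape pad_info uses above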
    data1 = data1.batch(batch_size=24, drop_remainder=True)
    start_time = time.time()
    num_batches = 0
    for _ in data1.create_dict_iterator(num_epochs=1):
        num_batches += 1
    _ = "total number of batches: " + str(num_batches) + " time elapsed: " + str(time.time() - start_time)
    # print(res)


# this function runs pad_batch and numpy.pad, then compares the results
def test_pad_via_map():
    cifar10_dir = "../data/dataset/testCifar10Data"

    def pad_map_config():
        data1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False, num_samples=1000)  # shape = [32,32,3]
        data1 = data1.map(operations=(lambda x: x.reshape(-1)), input_columns="image")  # reshape to 1d
        data1 = data1.map(operations=(lambda x: np.pad(x, (0, 816))), input_columns="image")
        data1 = data1.batch(batch_size=25, drop_remainder=True)
        res = []
        for data in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
            res.append(data["image"])
        return res

    def pad_batch_config():
        data2 = ds.Cifar10Dataset(cifar10_dir, shuffle=False, num_samples=1000)  # shape = [32,32,3]
        data2 = data2.map(operations=(lambda x: x.reshape(-1)), input_columns="image")  # reshape to 1d
        data2 = data2.batch(batch_size=25, drop_remainder=True, pad_info={"image": ([3888], 0)})
        res = []
        for data in data2.create_dict_iterator(num_epochs=1, output_numpy=True):
            res.append(data["image"])
        return res

    res_from_map = pad_map_config()
    res_from_batch = pad_batch_config()
    assert len(res_from_map) == len(res_from_batch)
    for i, _ in enumerate(res_from_map):
        np.testing.assert_array_equal(res_from_map[i], res_from_batch[i])


if __name__ == '__main__':
    test_batch_padding_01()
    test_batch_padding_02()
    test_batch_padding_03()
    test_batch_padding_04()
    test_batch_padding_05()
    # batch_padding_performance_3d()
    # batch_padding_performance_1d()
    # batch_pyfunc_padding_3d()
    # batch_pyfunc_padding_1d()
    test_pad_via_map()