# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import time
import numpy as np

import mindspore.dataset as ds


# This UT tests the following cases:

# 1. padding: input_shape=[x], output_shape=[y] where y > x
# 2. padding in one dimension and truncating in the other: input_shape=[x1,x2], output_shape=[y1,y2] where y1 > x1 and y2 < x2
# 3. automatic padding for a specific column
# 4. default setting for all columns
# 5. None in different places of pad_info

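# For reference (a summary inferred from the tests below rather than from MindSpore's
# formal docs): pad_info maps a column name to a (pad_shape, pad_value) pair, e.g.
#     pad_info={"col1d": ([2], -1)}  # pad column "col1d" to shape [2] with value -1
# A None pad_shape, or a None entry inside it, pads that dimension to the longest
# sample in the current batch; an empty pad_info ({}) pads every column automatically
# with the default pad value 0.
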
# this generator function yields two columns:
# col1d: [0], [1], [2], [3]
# col2d: [[100],[200]], [[101],[201]], [[102],[202]], [[103],[203]]
def gen_2cols(num):
    for i in range(num):
        yield (np.array([i]), np.array([[i + 100], [i + 200]]))


# this generator function yields one column of variable shapes:
# col: [0], [0,1], [0,1,2], [0,1,2,3]
def gen_var_col(num):
    for i in range(num):
        yield (np.array([j for j in range(i + 1)]),)


# this generator function yields two columns of variable shapes:
# col1: [0], [0,1], [0,1,2], [0,1,2,3]
# col2: [100], [100,101], [100,101,102], [100,101,102,103]
def gen_var_cols(num):
    for i in range(num):
        yield (np.array([j for j in range(i + 1)]), np.array([100 + j for j in range(i + 1)]))


# this generator function yields two columns of variable 2d shapes:
# col1: [[0]], [[0,1]], [[0,1,2]], [[0,1,2,3]]
# col2: [[100]], [[100,101]], [[100,101,102]], [[100,101,102,103]]
def gen_var_cols_2d(num):
    for i in range(num):
        yield (np.array([[j for j in range(i + 1)]]), np.array([[100 + j for j in range(i + 1)]]))


def test_batch_padding_01():
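    """Pad both columns to fixed shapes larger than the input shapes, then batch."""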
    data1 = ds.GeneratorDataset((lambda: gen_2cols(2)), ["col1d", "col2d"])
    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col2d": ([2, 2], -2), "col1d": ([2], -1)})
    data1 = data1.repeat(2)
    for data in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
        np.testing.assert_array_equal([[0, -1], [1, -1]], data["col1d"])
        np.testing.assert_array_equal([[[100, -2], [200, -2]], [[101, -2], [201, -2]]], data["col2d"])


def test_batch_padding_02():
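    """Pad one dimension and truncate the other: col2d goes from [2, 1] to [1, 2]; col1d is left untouched."""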
    data1 = ds.GeneratorDataset((lambda: gen_2cols(2)), ["col1d", "col2d"])
    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col2d": ([1, 2], -2)})
    data1 = data1.repeat(2)
    for data in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
        np.testing.assert_array_equal([[0], [1]], data["col1d"])
        np.testing.assert_array_equal([[[100, -2]], [[101, -2]]], data["col2d"])


def test_batch_padding_03():
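    """Automatic padding: a None pad_shape pads the variable-length column to the longest sample in each batch."""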
    data1 = ds.GeneratorDataset((lambda: gen_var_col(4)), ["col"])
    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={"col": (None, -1)})  # pad automatically
    data1 = data1.repeat(2)
    res = dict()
    for ind, data in enumerate(data1.create_dict_iterator(num_epochs=1, output_numpy=True)):
        res[ind] = data["col"].copy()
    np.testing.assert_array_equal(res[0], [[0, -1], [0, 1]])
    np.testing.assert_array_equal(res[1], [[0, 1, 2, -1], [0, 1, 2, 3]])
    np.testing.assert_array_equal(res[2], [[0, -1], [0, 1]])
    np.testing.assert_array_equal(res[3], [[0, 1, 2, -1], [0, 1, 2, 3]])


def test_batch_padding_04():
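    """Default setting: an empty pad_info pads every column automatically with pad value 0."""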
    data1 = ds.GeneratorDataset((lambda: gen_var_cols(2)), ["col1", "col2"])
    data1 = data1.batch(batch_size=2, drop_remainder=False, pad_info={})  # pad automatically
    data1 = data1.repeat(2)
    for data in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
        np.testing.assert_array_equal(data["col1"], [[0, 0], [0, 1]])
        np.testing.assert_array_equal(data["col2"], [[100, 0], [100, 101]])


def test_batch_padding_05():
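    """None in different places: a None pad_shape and a None dimension inside pad_shape both pad to the batch maximum."""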
    data1 = ds.GeneratorDataset((lambda: gen_var_cols_2d(3)), ["col1", "col2"])
    data1 = data1.batch(batch_size=3, drop_remainder=False,
                        pad_info={"col2": ([2, None], -2), "col1": (None, -1)})  # pad automatically
    for data in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
        np.testing.assert_array_equal(data["col1"], [[[0, -1, -1]], [[0, 1, -1]], [[0, 1, 2]]])
        np.testing.assert_array_equal(data["col2"], [[[100, -2, -2], [-2, -2, -2]], [[100, 101, -2], [-2, -2, -2]],
                                                     [[100, 101, 102], [-2, -2, -2]]])


def batch_padding_performance_3d():
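    """Manual performance check: pad 3d CIFAR-10 images via pad_info and time the iteration."""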
    cifar10_dir = "../data/dataset/testCifar10Data"
    data1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False)  # shape = [32,32,3]
    data1 = data1.repeat(24)
    pad_info = {"image": ([36, 36, 3], 0)}
    # pad_info = None
    data1 = data1.batch(batch_size=24, drop_remainder=True, pad_info=pad_info)
    start_time = time.time()
    num_batches = 0
    for _ in data1.create_dict_iterator(num_epochs=1):
        num_batches += 1
    res = "total number of batches: " + str(num_batches) + ", time elapsed: " + str(time.time() - start_time)
    # print(res)


def batch_padding_performance_1d():
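    """Manual performance check: flatten CIFAR-10 images to 1d, pad via pad_info and time the iteration."""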
    cifar10_dir = "../data/dataset/testCifar10Data"
    data1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False)  # shape = [32,32,3]
    data1 = data1.repeat(24)
    data1 = data1.map(operations=(lambda x: x.reshape(-1)), input_columns="image")
    pad_info = {"image": ([3888], 0)}  # 3888 = 36*36*3
    # pad_info = None
    data1 = data1.batch(batch_size=24, drop_remainder=True, pad_info=pad_info)
    start_time = time.time()
    num_batches = 0
    for _ in data1.create_dict_iterator(num_epochs=1):
        num_batches += 1
    res = "total number of batches: " + str(num_batches) + ", time elapsed: " + str(time.time() - start_time)
    # print(res)


def batch_pyfunc_padding_3d():
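    """Manual performance check: pad 3d CIFAR-10 images with a pyfunc (np.pad in map) instead of pad_info."""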
    cifar10_dir = "../data/dataset/testCifar10Data"
    data1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False)  # shape = [32,32,3]
    data1 = data1.repeat(24)
    # pad_info = {"image": ([36, 36, 3], 0)}
    data1 = data1.map(operations=(lambda x: np.pad(x, ((0, 4), (0, 4), (0, 0)))), input_columns="image",
                      python_multiprocessing=False)
    data1 = data1.batch(batch_size=24, drop_remainder=True)
    start_time = time.time()
    num_batches = 0
    for _ in data1.create_dict_iterator(num_epochs=1):
        num_batches += 1
    res = "total number of batches: " + str(num_batches) + ", time elapsed: " + str(time.time() - start_time)
    # print(res)


def batch_pyfunc_padding_1d():
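    """Manual performance check: flatten CIFAR-10 images to 1d and pad with a pyfunc instead of pad_info."""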
    cifar10_dir = "../data/dataset/testCifar10Data"
    data1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False)  # shape = [32,32,3]
    data1 = data1.repeat(24)
    data1 = data1.map(operations=(lambda x: x.reshape(-1)), input_columns="image")
    data1 = data1.map(operations=(lambda x: np.pad(x, (0, 816))), input_columns="image", python_multiprocessing=False)  # 3072 + 816 = 3888
    data1 = data1.batch(batch_size=24, drop_remainder=True)
    start_time = time.time()
    num_batches = 0
    for _ in data1.create_dict_iterator(num_epochs=1):
        num_batches += 1
    res = "total number of batches: " + str(num_batches) + ", time elapsed: " + str(time.time() - start_time)
    # print(res)


# this function runs pad_batch and numpy.pad, then compares the results
def test_pad_via_map():
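    """Check that padding via batch(pad_info=...) and padding via map(np.pad) produce identical results."""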
    cifar10_dir = "../data/dataset/testCifar10Data"

    def pad_map_config():
        data1 = ds.Cifar10Dataset(cifar10_dir, shuffle=False, num_samples=1000)  # shape = [32,32,3]
        data1 = data1.map(operations=(lambda x: x.reshape(-1)), input_columns="image")  # reshape to 1d
        data1 = data1.map(operations=(lambda x: np.pad(x, (0, 816))), input_columns="image")
        data1 = data1.batch(batch_size=25, drop_remainder=True)
        res = []
        for data in data1.create_dict_iterator(num_epochs=1, output_numpy=True):
            res.append(data["image"])
        return res

    def pad_batch_config():
        data2 = ds.Cifar10Dataset(cifar10_dir, shuffle=False, num_samples=1000)  # shape = [32,32,3]
        data2 = data2.map(operations=(lambda x: x.reshape(-1)), input_columns="image")  # reshape to 1d
        data2 = data2.batch(batch_size=25, drop_remainder=True, pad_info={"image": ([3888], 0)})
        res = []
        for data in data2.create_dict_iterator(num_epochs=1, output_numpy=True):
            res.append(data["image"])
        return res

    res_from_map = pad_map_config()
    res_from_batch = pad_batch_config()
    assert len(res_from_map) == len(res_from_batch)
    for i, _ in enumerate(res_from_map):
        np.testing.assert_array_equal(res_from_map[i], res_from_batch[i])


if __name__ == '__main__':
    test_batch_padding_01()
    test_batch_padding_02()
    test_batch_padding_03()
    test_batch_padding_04()
    test_batch_padding_05()
    # batch_padding_performance_3d()
    # batch_padding_performance_1d()
    # batch_pyfunc_padding_3d()
    # batch_pyfunc_padding_1d()
    test_pad_via_map()