/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <vector>

#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/fake_input.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_testutil.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"

namespace tensorflow {

class FusedBatchNormOpTest : public OpsTestBase {};

TEST_F(FusedBatchNormOpTest, Training) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("epsilon", 0.001)
                   .Attr("is_training", true)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
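  // Inputs: x of shape [1, 1, 6, 2], scale = {4, 4}, offset = {2, 2}; in
  // training mode the mean and variance inputs are left empty.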
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
  AddInputFromArray<float>(TensorShape({2}), {4.0, 4.0});
  AddInputFromArray<float>(TensorShape({2}), {2.0, 2.0});
  AddInputFromArray<float>(TensorShape({0}), {});
  AddInputFromArray<float>(TensorShape({0}), {});

  TF_ASSERT_OK(RunOpKernel());

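  // Per channel, the batch mean is (5 + 7 + 9 + 11 + 13 + 15) / 6 = 10 and the
  // population variance is 70 / 6 ~= 11.67, so the expected output is
  // scale * (x - 10) / sqrt(11.67 + epsilon) + offset.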
  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
                                      3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
  test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);

  Tensor expected_mean(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_mean, {10, 10});
  test::ExpectTensorNear<float>(expected_mean, *GetOutput(1), 0.01);

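  // The expected batch_variance, 14, equals 70 / 6 * 6 / 5, i.e. the
  // population variance with Bessel's correction applied.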
  Tensor expected_variance(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_variance, {14.00, 14.00});
  test::ExpectTensorNear<float>(expected_variance, *GetOutput(2), 0.01);
}

TEST_F(FusedBatchNormOpTest, Inference) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("epsilon", 0.001)
                   .Attr("is_training", false)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
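  // Same x, scale, and offset as the training test, but the population mean
  // (10) and variance (~11.67) are now supplied as inputs, so the expected
  // normalized output is unchanged.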
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
  AddInputFromArray<float>(TensorShape({2}), {4.0, 4.0});
  AddInputFromArray<float>(TensorShape({2}), {2.0, 2.0});
  AddInputFromArray<float>(TensorShape({2}), {10, 10});
  AddInputFromArray<float>(TensorShape({2}), {11.67f, 11.67f});

  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
                                      3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
  test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);
}

class FusedBatchNormGradOpTest : public OpsTestBase {};

TEST_F(FusedBatchNormGradOpTest, Simple) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_grad_op", "FusedBatchNormGrad")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("epsilon", 0.001)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
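  // Inputs: y_backprop, x, scale, and the saved batch statistics of x
  // (mean = 11 / 6 ~= 1.833, population variance ~= 57.472).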
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {2, 2, 9, 9, -4, -4, 5, 5, 8, 8, 7, 7});
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {1, 1, 7, 7, 4, 4, -3, -3, -11, -11, 13, 13});
  AddInputFromArray<float>(TensorShape({2}), {4, 4});
  AddInputFromArray<float>(TensorShape({2}), {1.833f, 1.833f});
  AddInputFromArray<float>(TensorShape({2}), {57.472f, 57.472f});

  TF_ASSERT_OK(RunOpKernel());

  Tensor expected_x(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected_x, {-1.34, -1.34, 2.47, 2.47, -4.44, -4.44,
                                        0.17, 0.17, 1.60, 1.60, 1.53, 1.53});
  test::ExpectTensorNear<float>(expected_x, *GetOutput(0), 0.01);

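  // The scale gradient is the per-channel sum of y_backprop * x_hat, where
  // x_hat is x normalized with the saved statistics; here it is roughly -1.65.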
  Tensor expected_scale(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_scale, {-1.6488, -1.6488});
  test::ExpectTensorNear<float>(expected_scale, *GetOutput(1), 0.01);

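  // The offset gradient is the per-channel sum of y_backprop:
  // 2 + 9 - 4 + 5 + 8 + 7 = 27.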
  Tensor expected_offset(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_offset, {27, 27});
  test::ExpectTensorNear<float>(expected_offset, *GetOutput(2), 0.01);
}

//----------------------------------------------------------------------------//
// Performance benchmarks are below.                                          //
//----------------------------------------------------------------------------//

using fp32 = float;
using fp16 = Eigen::half;

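// Builds a graph with a single FusedBatchNormV3 node fed by constant inputs.
// In training mode the mean and variance inputs are passed as empty tensors;
// in inference mode the same random constant used for scale and offset stands
// in for them.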
template <typename T>
static Graph* FusedBatchNormInference(int n, int h, int w, int c,
                                      bool is_training,
                                      TensorFormat data_format) {
  Graph* g = new Graph(OpRegistry::Global());

  DataType dtype = DataTypeToEnum<T>::value;
  Tensor x_t(dtype, data_format == FORMAT_NHWC ? TensorShape({n, h, w, c})
                                               : TensorShape({n, c, h, w}));
  x_t.flat<T>().setRandom();

  Tensor other_t(DT_FLOAT, TensorShape({c}));
  other_t.flat<float>().setRandom();

  Tensor empty_t(DT_FLOAT, TensorShape({0}));

  Node* x = test::graph::Constant(g, x_t, "x");
  Node* other = test::graph::Constant(g, other_t, "other");
  Node* empty = test::graph::Constant(g, empty_t, "empty");

  Node* fused_batch_norm;
  TF_CHECK_OK(NodeBuilder(g->NewName("fused_batch_norm"), "FusedBatchNormV3")
                  .Input(x)
                  .Input(other)                        // scale
                  .Input(other)                        // offset
                  .Input(is_training ? empty : other)  // mean
                  .Input(is_training ? empty : other)  // variance
                  .Attr("T", dtype)
                  .Attr("U", DT_FLOAT)
                  .Attr("epsilon", 0.001)
                  .Attr("is_training", is_training)
                  .Attr("data_format", ToString(data_format))
                  .Finalize(g, &fused_batch_norm));

  return g;
}

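// Builds a graph with a single FusedBatchNormGradV3 node; one random constant
// stands in for scale, the saved statistics, and the reserve space.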
template <typename T>
static Graph* FusedBatchNormGrad(int n, int h, int w, int c, bool is_training,
                                 TensorFormat data_format) {
  Graph* g = new Graph(OpRegistry::Global());

  DataType dtype = DataTypeToEnum<T>::value;
  TensorShape shape = data_format == FORMAT_NHWC ? TensorShape({n, h, w, c})
                                                 : TensorShape({n, c, h, w});

  Tensor y_backprop_t(dtype, shape);
  y_backprop_t.flat<T>().setRandom();

  Tensor x_t(dtype, shape);
  x_t.flat<T>().setRandom();

  Tensor other_t(DT_FLOAT, TensorShape({c}));
  other_t.flat<float>().setRandom();

  Node* y_backprop = test::graph::Constant(g, y_backprop_t, "y_backprop");
  Node* x = test::graph::Constant(g, x_t, "x");
  Node* other = test::graph::Constant(g, other_t, "other");

  Node* fused_batch_norm;
  TF_CHECK_OK(
      NodeBuilder(g->NewName("fused_batch_norm_grad"), "FusedBatchNormGradV3")
          .Input(y_backprop)
          .Input(x)
          .Input(other)  // scale
          .Input(other)  // saved_mean_or_pop_mean
          .Input(other)  // saved_maybe_inv_var_or_pop_var
          .Input(other)  // reserve_space
          .Attr("T", dtype)
          .Attr("U", DT_FLOAT)
          .Attr("epsilon", 0.001)
          .Attr("is_training", is_training)
          .Attr("data_format", ToString(data_format))
          .Finalize(g, &fused_batch_norm));

  return g;
}

#define BM_NAME(NAME, N, H, W, C, T, IT, FORMAT, DEVICE) \
  BM_##NAME##_##N##_##H##_##W##_##C##_##IT##_##FORMAT##_##T##_##DEVICE
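// For example, BM_NAME(FusedBatchNorm, 64, 14, 14, 256, fp32, false, NHWC,
// cpu) expands to BM_FusedBatchNorm_64_14_14_256_false_NHWC_fp32_cpu.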

// -------------------------------------------------------------------------- //
// FusedBatchNorm inference
// -------------------------------------------------------------------------- //

#define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)       \
  static void BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT,   \
                      DEVICE)(int iters) {                                  \
    testing::UseRealTime();                                                 \
    testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C);     \
    test::Benchmark(#DEVICE, FusedBatchNormInference<T>(                    \
                                 N, H, W, C, IS_TRAINING, FORMAT_##FORMAT)) \
        .Run(iters);                                                        \
  }                                                                         \
  BENCHMARK(                                                                \
      BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE));

BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, cpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, cpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NHWC, cpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NHWC, cpu);

#ifdef GOOGLE_CUDA
BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, gpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NCHW, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NCHW, gpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NHWC, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NHWC, gpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NCHW, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NCHW, gpu);
#endif  // GOOGLE_CUDA

// -------------------------------------------------------------------------- //
// FusedBatchNorm gradient
// -------------------------------------------------------------------------- //

#define BM_FusedBatchNormGrad(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)     \
  static void BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, \
                      DEVICE)(int iters) {                                    \
    testing::UseRealTime();                                                   \
    testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C);       \
    test::Benchmark(#DEVICE, FusedBatchNormGrad<T>(N, H, W, C, IS_TRAINING,   \
                                                   FORMAT_##FORMAT))          \
        .Run(iters);                                                          \
  }                                                                           \
  BENCHMARK(BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT,   \
                    DEVICE));

#define BM_FusedBatchNormGradResnetShapes(T, IS_TRAINING, FORMAT, DEVICE)  \
  BM_FusedBatchNormGrad(64, 56, 56, 64, T, IS_TRAINING, FORMAT, DEVICE);   \
  BM_FusedBatchNormGrad(64, 56, 56, 128, T, IS_TRAINING, FORMAT, DEVICE);  \
  BM_FusedBatchNormGrad(64, 56, 56, 256, T, IS_TRAINING, FORMAT, DEVICE);  \
                                                                           \
  BM_FusedBatchNormGrad(64, 28, 28, 128, T, IS_TRAINING, FORMAT, DEVICE);  \
  BM_FusedBatchNormGrad(64, 28, 28, 256, T, IS_TRAINING, FORMAT, DEVICE);  \
  BM_FusedBatchNormGrad(64, 28, 28, 512, T, IS_TRAINING, FORMAT, DEVICE);  \
                                                                           \
  BM_FusedBatchNormGrad(64, 14, 14, 128, T, IS_TRAINING, FORMAT, DEVICE);  \
  BM_FusedBatchNormGrad(64, 14, 14, 256, T, IS_TRAINING, FORMAT, DEVICE);  \
  BM_FusedBatchNormGrad(64, 14, 14, 1024, T, IS_TRAINING, FORMAT, DEVICE)

BM_FusedBatchNormGradResnetShapes(fp32, true, NHWC, cpu);
BM_FusedBatchNormGradResnetShapes(fp32, false, NHWC, cpu);

#ifdef GOOGLE_CUDA
BM_FusedBatchNormGradResnetShapes(fp32, true, NHWC, gpu);
BM_FusedBatchNormGradResnetShapes(fp16, true, NHWC, gpu);
BM_FusedBatchNormGradResnetShapes(fp32, true, NCHW, gpu);
BM_FusedBatchNormGradResnetShapes(fp16, true, NCHW, gpu);

BM_FusedBatchNormGradResnetShapes(fp32, false, NHWC, gpu);
BM_FusedBatchNormGradResnetShapes(fp16, false, NHWC, gpu);
#endif  // GOOGLE_CUDA

}  // namespace tensorflow