/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <vector>

#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/fake_input.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_testutil.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"

namespace tensorflow {
class FusedBatchNormOpTest : public OpsTestBase {};

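// In training mode with exponential_avg_factor = 1.0, the op computes batch
// statistics and returns them directly. For the per-channel values
// {5, 7, 9, 11, 13, 15}: batch mean = 60 / 6 = 10; the biased variance
// 70 / 6 ~= 11.67 is used for normalization, while the unbiased variance
// 70 / 5 = 14 is returned as output 2. Output 0 is
// scale * (x - mean) / sqrt(variance + epsilon) + offset, e.g.
// 4 * (5 - 10) / sqrt(11.67 + 0.001) + 2 ~= -3.86.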
TEST_F(FusedBatchNormOpTest, Training) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("exponential_avg_factor", 1.0)
                   .Attr("epsilon", 0.001)
                   .Attr("is_training", true)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
  AddInputFromArray<float>(TensorShape({2}), {4.0, 4.0});
  AddInputFromArray<float>(TensorShape({2}), {2.0, 2.0});
  AddInputFromArray<float>(TensorShape({0}), {});
  AddInputFromArray<float>(TensorShape({0}), {});

  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
                                      3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
  test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);

  Tensor expected_mean(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_mean, {10, 10});
  test::ExpectTensorNear<float>(expected_mean, *GetOutput(1), 0.01);

  Tensor expected_variance(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_variance, {14.00, 14.00});
  test::ExpectTensorNear<float>(expected_variance, *GetOutput(2), 0.01);
}

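// With exponential_avg_factor = 0.5 the returned moments blend the incoming
// running statistics with the batch statistics computed as above:
//   new_running = (1 - factor) * old_running + factor * batch.
// Mean: 0.5 * 6 + 0.5 * 10 = 8; variance: 0.5 * 16 + 0.5 * 14 = 15.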
TEST_F(FusedBatchNormOpTest, TrainingRunningMean) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("exponential_avg_factor", 0.5)
                   .Attr("epsilon", 0.001)
                   .Attr("is_training", true)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
  AddInputFromArray<float>(TensorShape({2}), {4.0, 4.0});
  AddInputFromArray<float>(TensorShape({2}), {2.0, 2.0});
  AddInputFromArray<float>(TensorShape({2}), {6.0, 6.0});
  AddInputFromArray<float>(TensorShape({2}), {16.0, 16.0});

  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
                                      3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
  test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);

  Tensor expected_mean(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_mean, {8, 8});
  test::ExpectTensorNear<float>(expected_mean, *GetOutput(1), 0.01);

  Tensor expected_variance(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_variance, {15.00, 15.00});
  test::ExpectTensorNear<float>(expected_variance, *GetOutput(2), 0.01);
}

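// In inference mode the op normalizes with the supplied population mean (10)
// and variance (11.67, the biased batch variance of the same data), so the
// normalized output matches the training test above.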
TEST_F(FusedBatchNormOpTest, Inference) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("epsilon", 0.001)
                   .Attr("is_training", false)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
  AddInputFromArray<float>(TensorShape({2}), {4.0, 4.0});
  AddInputFromArray<float>(TensorShape({2}), {2.0, 2.0});
  AddInputFromArray<float>(TensorShape({2}), {10, 10});
  AddInputFromArray<float>(TensorShape({2}), {11.67f, 11.67f});

  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
                                      3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
  test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);
}

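// exponential_avg_factor only affects the running-statistics update, which is
// skipped when is_training = false, so the attribute is ignored and the
// output is identical to the plain inference test above.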
TEST_F(FusedBatchNormOpTest, InferenceIgnoreAvgFactor) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("exponential_avg_factor", 0.5)
                   .Attr("epsilon", 0.001)
                   .Attr("is_training", false)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
  AddInputFromArray<float>(TensorShape({2}), {4.0, 4.0});
  AddInputFromArray<float>(TensorShape({2}), {2.0, 2.0});
  AddInputFromArray<float>(TensorShape({2}), {10, 10});
  AddInputFromArray<float>(TensorShape({2}), {11.67f, 11.67f});

  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
                                      3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
  test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);
}

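// A zero-element input must be handled gracefully: the kernel should run
// without error and produce an output with the same (empty) shape.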
TEST_F(FusedBatchNormOpTest, EmptyInput) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("epsilon", 0.001)
                   .Attr("is_training", true)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
  AddInputFromArray<float>(TensorShape({1, 1, 0, 0}), {});
  AddInputFromArray<float>(TensorShape({0}), {});
  AddInputFromArray<float>(TensorShape({0}), {});
  AddInputFromArray<float>(TensorShape({0}), {});
  AddInputFromArray<float>(TensorShape({0}), {});

  TF_ASSERT_OK(RunOpKernel());
  EXPECT_EQ(GetOutput(0)->shape(), TensorShape({1, 1, 0, 0}));
}

class FusedBatchNormGradOpTest : public OpsTestBase {};

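// Inputs are y_backprop, x, scale, saved mean (1.833), and saved variance
// (57.472). The expected channel gradients follow from the batch norm
// backward pass:
//   offset_grad = sum(y_backprop) = 2 + 9 - 4 + 5 + 8 + 7 = 27
//   scale_grad = sum(y_backprop * (x - mean) * rsqrt(variance + epsilon))
//              ~= -1.65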
TEST_F(FusedBatchNormGradOpTest, Simple) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_grad_op", "FusedBatchNormGrad")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("epsilon", 0.001)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {2, 2, 9, 9, -4, -4, 5, 5, 8, 8, 7, 7});
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {1, 1, 7, 7, 4, 4, -3, -3, -11, -11, 13, 13});
  AddInputFromArray<float>(TensorShape({2}), {4, 4});
  AddInputFromArray<float>(TensorShape({2}), {1.833f, 1.833f});
  AddInputFromArray<float>(TensorShape({2}), {57.472f, 57.472f});

  TF_ASSERT_OK(RunOpKernel());

  Tensor expected_x(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected_x, {-1.34, -1.34, 2.47, 2.47, -4.44, -4.44,
                                        0.17, 0.17, 1.60, 1.60, 1.53, 1.53});
  test::ExpectTensorNear<float>(expected_x, *GetOutput(0), 0.01);

  Tensor expected_scale(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_scale, {-1.6488, -1.6488});
  test::ExpectTensorNear<float>(expected_scale, *GetOutput(1), 0.01);

  Tensor expected_offset(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_offset, {27, 27});
  test::ExpectTensorNear<float>(expected_offset, *GetOutput(2), 0.01);
}

//----------------------------------------------------------------------------//
// Performance benchmarks are below.                                          //
//----------------------------------------------------------------------------//

using fp32 = float;
using fp16 = Eigen::half;

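// Builds a graph with a single FusedBatchNormV3 node. In training mode the
// mean and variance inputs are left empty so the op computes batch
// statistics; in inference mode they are fed random population statistics.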
template <typename T>
static Graph* FusedBatchNormInference(int n, int h, int w, int c,
                                      bool is_training,
                                      TensorFormat data_format) {
  Graph* g = new Graph(OpRegistry::Global());

  DataType dtype = DataTypeToEnum<T>::value;
  Tensor x_t(dtype, data_format == FORMAT_NHWC ? TensorShape({n, h, w, c})
                                               : TensorShape({n, c, h, w}));
  x_t.flat<T>().setRandom();

  Tensor other_t(DT_FLOAT, TensorShape({c}));
  other_t.flat<float>().setRandom();

  Tensor empty_t(DT_FLOAT, TensorShape({0}));

  Node* x = test::graph::Constant(g, x_t, "x");
  Node* other = test::graph::Constant(g, other_t, "other");
  Node* empty = test::graph::Constant(g, empty_t, "empty");

  Node* fused_batch_norm;
  TF_CHECK_OK(NodeBuilder(g->NewName("fused_batch_norm"), "FusedBatchNormV3")
                  .Input(x)
                  .Input(other)                        // scale
                  .Input(other)                        // offset
                  .Input(is_training ? empty : other)  // mean
                  .Input(is_training ? empty : other)  // variance
                  .Attr("T", dtype)
                  .Attr("U", DT_FLOAT)
                  .Attr("epsilon", 0.001)
                  .Attr("is_training", is_training)
                  .Attr("data_format", ToString(data_format))
                  .Finalize(g, &fused_batch_norm));

  return g;
}

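// Builds a graph with a single FusedBatchNormGradV3 node, fed random
// activations, output gradients, and saved statistics.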
template <typename T>
static Graph* FusedBatchNormGrad(int n, int h, int w, int c, bool is_training,
                                 TensorFormat data_format) {
  Graph* g = new Graph(OpRegistry::Global());

  DataType dtype = DataTypeToEnum<T>::value;
  TensorShape shape = data_format == FORMAT_NHWC ? TensorShape({n, h, w, c})
                                                 : TensorShape({n, c, h, w});

  Tensor y_backprop_t(dtype, shape);
  y_backprop_t.flat<T>().setRandom();

  Tensor x_t(dtype, shape);
  x_t.flat<T>().setRandom();

  Tensor other_t(DT_FLOAT, TensorShape({c}));
  other_t.flat<float>().setRandom();

  Node* y_backprop = test::graph::Constant(g, y_backprop_t, "y_backprop");
  Node* x = test::graph::Constant(g, x_t, "x");
  Node* other = test::graph::Constant(g, other_t, "other");

  Node* fused_batch_norm;
  TF_CHECK_OK(
      NodeBuilder(g->NewName("fused_batch_norm_grad"), "FusedBatchNormGradV3")
          .Input(y_backprop)
          .Input(x)
          .Input(other)  // scale
          .Input(other)  // saved_mean_or_pop_mean
          .Input(other)  // saved_maybe_inv_var_or_pop_var
          .Input(other)  // reserve_space
          .Attr("T", dtype)
          .Attr("U", DT_FLOAT)
          .Attr("epsilon", 0.001)
          .Attr("is_training", is_training)
          .Attr("data_format", ToString(data_format))
          .Finalize(g, &fused_batch_norm));

  return g;
}

#define BM_NAME(NAME, N, H, W, C, T, IT, FORMAT, DEVICE) \
  BM_##NAME##_##N##_##H##_##W##_##C##_##IT##_##FORMAT##_##T##_##DEVICE
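// For example, BM_NAME(FusedBatchNorm, 64, 14, 14, 256, fp32, false, NHWC,
// cpu) expands to BM_FusedBatchNorm_64_14_14_256_false_NHWC_fp32_cpu.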

// -------------------------------------------------------------------------- //
// FusedBatchNorm inference
// -------------------------------------------------------------------------- //
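// Each instantiation below defines one benchmark; the reported items
// processed count is the number of tensor elements touched per iteration
// (N * H * W * C).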
// clang-format off
// NOLINTBEGIN
#define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE) \
  static void BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)(::testing::benchmark::State & state) { \
    test::Benchmark( \
        #DEVICE, \
        FusedBatchNormInference<T>(N, H, W, C, IS_TRAINING, FORMAT_##FORMAT), \
        /*old_benchmark_api*/ false) \
        .Run(state); \
    state.SetItemsProcessed(state.iterations() * N * H * W * C); \
  } \
  BENCHMARK( \
      BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)) \
      ->UseRealTime();

// NOLINTEND
// clang-format on

BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, cpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, cpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NHWC, cpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NHWC, cpu);

#ifdef GOOGLE_CUDA
BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, gpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NCHW, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NCHW, gpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NHWC, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NHWC, gpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NCHW, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NCHW, gpu);
#endif  // GOOGLE_CUDA

// -------------------------------------------------------------------------- //
// FusedBatchNorm gradient
// -------------------------------------------------------------------------- //

#define BM_FusedBatchNormGrad(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)      \
  static void BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT,  \
                      DEVICE)(::testing::benchmark::State & state) {           \
    test::Benchmark(                                                           \
        #DEVICE,                                                               \
        FusedBatchNormGrad<T>(N, H, W, C, IS_TRAINING, FORMAT_##FORMAT),       \
        /*old_benchmark_api*/ false)                                           \
        .Run(state);                                                           \
    state.SetItemsProcessed(state.iterations() * N * H * W * C);               \
  }                                                                            \
  BENCHMARK(                                                                   \
      BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)) \
      ->UseRealTime();

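// Instantiates the gradient benchmark over feature-map shapes typical of
// ResNet-style models: 56x56, 28x28, and 14x14 spatial sizes at increasing
// channel counts.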
#define BM_FusedBatchNormGradResnetShapes(T, IS_TRAINING, FORMAT, DEVICE)  \
  BM_FusedBatchNormGrad(64, 56, 56, 64, T, IS_TRAINING, FORMAT, DEVICE);   \
  BM_FusedBatchNormGrad(64, 56, 56, 128, T, IS_TRAINING, FORMAT, DEVICE);  \
  BM_FusedBatchNormGrad(64, 56, 56, 256, T, IS_TRAINING, FORMAT, DEVICE);  \
                                                                           \
  BM_FusedBatchNormGrad(64, 28, 28, 128, T, IS_TRAINING, FORMAT, DEVICE);  \
  BM_FusedBatchNormGrad(64, 28, 28, 256, T, IS_TRAINING, FORMAT, DEVICE);  \
  BM_FusedBatchNormGrad(64, 28, 28, 512, T, IS_TRAINING, FORMAT, DEVICE);  \
                                                                           \
  BM_FusedBatchNormGrad(64, 14, 14, 128, T, IS_TRAINING, FORMAT, DEVICE);  \
  BM_FusedBatchNormGrad(64, 14, 14, 256, T, IS_TRAINING, FORMAT, DEVICE);  \
  BM_FusedBatchNormGrad(64, 14, 14, 1024, T, IS_TRAINING, FORMAT, DEVICE)

BM_FusedBatchNormGradResnetShapes(fp32, true, NHWC, cpu);
BM_FusedBatchNormGradResnetShapes(fp32, false, NHWC, cpu);

#ifdef GOOGLE_CUDA
BM_FusedBatchNormGradResnetShapes(fp32, true, NHWC, gpu);
BM_FusedBatchNormGradResnetShapes(fp16, true, NHWC, gpu);
BM_FusedBatchNormGradResnetShapes(fp32, true, NCHW, gpu);
BM_FusedBatchNormGradResnetShapes(fp16, true, NCHW, gpu);

BM_FusedBatchNormGradResnetShapes(fp32, false, NHWC, gpu);
BM_FusedBatchNormGradResnetShapes(fp16, false, NHWC, gpu);
#endif  // GOOGLE_CUDA

}  // namespace tensorflow