/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <vector>

#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/fake_input.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_testutil.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"

namespace tensorflow {
class FusedBatchNormOpTest : public OpsTestBase {};

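// For reference, and derived from the expected values in the tests below
// rather than from the kernel implementation: FusedBatchNorm computes
//   y = scale * (x - mean) / sqrt(variance + epsilon) + offset
// per channel. In training mode the statistics come from the batch itself;
// for the per-channel input {5, 7, 9, 11, 13, 15} used below, mean = 60/6 =
// 10 and the population variance is 70/6 ~= 11.67, so the first output is
//   4 * (5 - 10) / sqrt(11.67 + 0.001) + 2 ~= -3.86.
// The variance *output*, by contrast, is the unbiased (Bessel-corrected)
// estimate 70/5 = 14.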
TEST_F(FusedBatchNormOpTest, Training) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("exponential_avg_factor", 1.0)
                   .Attr("epsilon", 0.001)
                   .Attr("is_training", true)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
  AddInputFromArray<float>(TensorShape({2}), {4.0, 4.0});
  AddInputFromArray<float>(TensorShape({2}), {2.0, 2.0});
  AddInputFromArray<float>(TensorShape({0}), {});
  AddInputFromArray<float>(TensorShape({0}), {});

  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
                                      3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
  test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);

  Tensor expected_mean(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_mean, {10, 10});
  test::ExpectTensorNear<float>(expected_mean, *GetOutput(1), 0.01);

  Tensor expected_variance(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_variance, {14.00, 14.00});
  test::ExpectTensorNear<float>(expected_variance, *GetOutput(2), 0.01);
}

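// With exponential_avg_factor = 0.5, the running statistics should be
// updated as a convex combination of the old moving values and the batch
// statistics (batch mean 10, unbiased batch variance 14, as in the Training
// test above):
//   new_mean = 0.5 * 6  + 0.5 * 10 = 8
//   new_var  = 0.5 * 16 + 0.5 * 14 = 15
// (numbers derived from this test's inputs and expected outputs).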
TEST_F(FusedBatchNormOpTest, TrainingRunningMean) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("exponential_avg_factor", 0.5)
                   .Attr("epsilon", 0.001)
                   .Attr("is_training", true)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
  AddInputFromArray<float>(TensorShape({2}), {4.0, 4.0});
  AddInputFromArray<float>(TensorShape({2}), {2.0, 2.0});
  AddInputFromArray<float>(TensorShape({2}), {6.0, 6.0});
  AddInputFromArray<float>(TensorShape({2}), {16.0, 16.0});

  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
                                      3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
  test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);

  Tensor expected_mean(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_mean, {8, 8});
  test::ExpectTensorNear<float>(expected_mean, *GetOutput(1), 0.01);

  Tensor expected_variance(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_variance, {15.00, 15.00});
  test::ExpectTensorNear<float>(expected_variance, *GetOutput(2), 0.01);
}

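// In inference mode (is_training = false) the op normalizes with the
// supplied moving mean and variance rather than batch statistics. Feeding
// the population statistics of the batch used above (mean 10, variance
// ~11.67) should therefore reproduce the same normalized output.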
TEST_F(FusedBatchNormOpTest, Inference) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("epsilon", 0.001)
                   .Attr("is_training", false)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
  AddInputFromArray<float>(TensorShape({2}), {4.0, 4.0});
  AddInputFromArray<float>(TensorShape({2}), {2.0, 2.0});
  AddInputFromArray<float>(TensorShape({2}), {10, 10});
  AddInputFromArray<float>(TensorShape({2}), {11.67f, 11.67f});

  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
                                      3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
  test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);
}

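// exponential_avg_factor only affects the running-statistics update during
// training; with is_training = false it should be ignored, so the expected
// output is identical to the Inference test above.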
TEST_F(FusedBatchNormOpTest, InferenceIgnoreAvgFactor) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("exponential_avg_factor", 0.5)
                   .Attr("epsilon", 0.001)
                   .Attr("is_training", false)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
  AddInputFromArray<float>(TensorShape({2}), {4.0, 4.0});
  AddInputFromArray<float>(TensorShape({2}), {2.0, 2.0});
  AddInputFromArray<float>(TensorShape({2}), {10, 10});
  AddInputFromArray<float>(TensorShape({2}), {11.67f, 11.67f});

  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
                                      3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
  test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);
}

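// An empty input should run cleanly and produce an output of the same
// (empty) shape.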
TEST_F(FusedBatchNormOpTest, EmptyInput) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("epsilon", 0.001)
                   .Attr("is_training", true)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
  AddInputFromArray<float>(TensorShape({1, 1, 0, 0}), {});
  AddInputFromArray<float>(TensorShape({0}), {});
  AddInputFromArray<float>(TensorShape({0}), {});
  AddInputFromArray<float>(TensorShape({0}), {});
  AddInputFromArray<float>(TensorShape({0}), {});

  TF_ASSERT_OK(RunOpKernel());
  EXPECT_EQ(GetOutput(0)->shape(), TensorShape({1, 1, 0, 0}));
}

class FusedBatchNormGradOpTest : public OpsTestBase {};

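// FusedBatchNormGrad consumes (y_backprop, x, scale, saved_mean,
// saved_variance) and emits gradients w.r.t. x, scale, and offset. For
// reference, derived from the expected values below rather than from the
// kernel implementation: the offset gradient is the per-channel sum of
// y_backprop, 2 + 9 - 4 + 5 + 8 + 7 = 27, and the scale gradient is
// sum(y_backprop * (x - saved_mean) / sqrt(saved_var + epsilon)) ~= -1.65,
// where saved_mean = 1.833 and saved_var = 57.472 are the population
// statistics of x = {1, 7, 4, -3, -11, 13}.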
TEST_F(FusedBatchNormGradOpTest, Simple) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_grad_op", "FusedBatchNormGrad")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("epsilon", 0.001)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {2, 2, 9, 9, -4, -4, 5, 5, 8, 8, 7, 7});
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {1, 1, 7, 7, 4, 4, -3, -3, -11, -11, 13, 13});
  AddInputFromArray<float>(TensorShape({2}), {4, 4});
  AddInputFromArray<float>(TensorShape({2}), {1.833f, 1.833f});
  AddInputFromArray<float>(TensorShape({2}), {57.472f, 57.472f});

  TF_ASSERT_OK(RunOpKernel());

  Tensor expected_x(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected_x, {-1.34, -1.34, 2.47, 2.47, -4.44, -4.44,
                                        0.17, 0.17, 1.60, 1.60, 1.53, 1.53});
  test::ExpectTensorNear<float>(expected_x, *GetOutput(0), 0.01);

  Tensor expected_scale(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_scale, {-1.6488, -1.6488});
  test::ExpectTensorNear<float>(expected_scale, *GetOutput(1), 0.01);

  Tensor expected_offset(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_offset, {27, 27});
  test::ExpectTensorNear<float>(expected_offset, *GetOutput(2), 0.01);
}

//----------------------------------------------------------------------------//
// Performance benchmarks are below.                                          //
//----------------------------------------------------------------------------//

using fp32 = float;
using fp16 = Eigen::half;

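// The helpers below build a standalone Graph feeding Constant tensors into a
// FusedBatchNormV3 (or FusedBatchNormGradV3) node. Note that the inference
// benchmark graph passes empty mean/variance tensors when is_training is
// true, so the kernel computes batch statistics itself.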
template <typename T>
static Graph* FusedBatchNormInference(int n, int h, int w, int c,
                                      bool is_training,
                                      TensorFormat data_format) {
  Graph* g = new Graph(OpRegistry::Global());

  DataType dtype = DataTypeToEnum<T>::value;
  Tensor x_t(dtype, data_format == FORMAT_NHWC ? TensorShape({n, h, w, c})
                                               : TensorShape({n, c, h, w}));
  x_t.flat<T>().setRandom();

  Tensor other_t(DT_FLOAT, TensorShape({c}));
  other_t.flat<float>().setRandom();

  Tensor empty_t(DT_FLOAT, TensorShape({0}));

  Node* x = test::graph::Constant(g, x_t, "x");
  Node* other = test::graph::Constant(g, other_t, "other");
  Node* empty = test::graph::Constant(g, empty_t, "empty");

  Node* fused_batch_norm;
  TF_CHECK_OK(NodeBuilder(g->NewName("fused_batch_norm"), "FusedBatchNormV3")
                  .Input(x)
                  .Input(other)                        // scale
                  .Input(other)                        // offset
                  .Input(is_training ? empty : other)  // mean
                  .Input(is_training ? empty : other)  // variance
                  .Attr("T", dtype)
                  .Attr("U", DT_FLOAT)
                  .Attr("epsilon", 0.001)
                  .Attr("is_training", is_training)
                  .Attr("data_format", ToString(data_format))
                  .Finalize(g, &fused_batch_norm));

  return g;
}

template <typename T>
static Graph* FusedBatchNormGrad(int n, int h, int w, int c, bool is_training,
                                 TensorFormat data_format) {
  Graph* g = new Graph(OpRegistry::Global());

  DataType dtype = DataTypeToEnum<T>::value;
  TensorShape shape = data_format == FORMAT_NHWC ? TensorShape({n, h, w, c})
                                                 : TensorShape({n, c, h, w});

  Tensor y_backprop_t(dtype, shape);
  y_backprop_t.flat<T>().setRandom();

  Tensor x_t(dtype, shape);
  x_t.flat<T>().setRandom();

  Tensor other_t(DT_FLOAT, TensorShape({c}));
  other_t.flat<float>().setRandom();

  Node* y_backprop = test::graph::Constant(g, y_backprop_t, "y_backprop");
  Node* x = test::graph::Constant(g, x_t, "x");
  Node* other = test::graph::Constant(g, other_t, "other");

  Node* fused_batch_norm;
  TF_CHECK_OK(
      NodeBuilder(g->NewName("fused_batch_norm_grad"), "FusedBatchNormGradV3")
          .Input(y_backprop)
          .Input(x)
          .Input(other)  // scale
          .Input(other)  // saved_mean_or_pop_mean
          .Input(other)  // saved_maybe_inv_var_or_pop_var
          .Input(other)  // reserve_space
          .Attr("T", dtype)
          .Attr("U", DT_FLOAT)
          .Attr("epsilon", 0.001)
          .Attr("is_training", is_training)
          .Attr("data_format", ToString(data_format))
          .Finalize(g, &fused_batch_norm));

  return g;
}

#define BM_NAME(NAME, N, H, W, C, T, IT, FORMAT, DEVICE) \
  BM_##NAME##_##N##_##H##_##W##_##C##_##IT##_##FORMAT##_##T##_##DEVICE

// -------------------------------------------------------------------------- //
// FusedBatchNorm inference
// -------------------------------------------------------------------------- //
// clang-format off
// NOLINTBEGIN
#define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)         \
  static void BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)(::testing::benchmark::State & state) {                     \
    test::Benchmark(                                                          \
        #DEVICE,                                                              \
        FusedBatchNormInference<T>(N, H, W, C, IS_TRAINING, FORMAT_##FORMAT), \
        /*old_benchmark_api*/ false)                                          \
        .Run(state);                                                          \
    state.SetItemsProcessed(state.iterations() * N * H * W * C);              \
  }                                                                           \
  BENCHMARK(                                                                  \
      BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE))    \
      ->UseRealTime();

// NOLINTEND
// clang-format on

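// For example, BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, cpu)
// expands via BM_NAME to a benchmark function named
// BM_FusedBatchNorm_64_14_14_256_false_NHWC_fp32_cpu and registers it with
// the benchmark framework.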
BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, cpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, cpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NHWC, cpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NHWC, cpu);

#ifdef GOOGLE_CUDA
BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, gpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NCHW, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NCHW, gpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NHWC, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NHWC, gpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NCHW, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NCHW, gpu);
#endif  // GOOGLE_CUDA

// -------------------------------------------------------------------------- //
// FusedBatchNorm gradient
// -------------------------------------------------------------------------- //

#define BM_FusedBatchNormGrad(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)      \
  static void BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT,  \
                      DEVICE)(::testing::benchmark::State & state) {           \
    test::Benchmark(                                                           \
        #DEVICE,                                                               \
        FusedBatchNormGrad<T>(N, H, W, C, IS_TRAINING, FORMAT_##FORMAT),       \
        /*old_benchmark_api*/ false)                                           \
        .Run(state);                                                           \
    state.SetItemsProcessed(state.iterations() * N * H * W * C);               \
  }                                                                            \
  BENCHMARK(                                                                   \
      BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)) \
      ->UseRealTime();

#define BM_FusedBatchNormGradResnetShapes(T, IS_TRAINING, FORMAT, DEVICE) \
  BM_FusedBatchNormGrad(64, 56, 56, 64, T, IS_TRAINING, FORMAT, DEVICE);  \
  BM_FusedBatchNormGrad(64, 56, 56, 128, T, IS_TRAINING, FORMAT, DEVICE); \
  BM_FusedBatchNormGrad(64, 56, 56, 256, T, IS_TRAINING, FORMAT, DEVICE); \
                                                                          \
  BM_FusedBatchNormGrad(64, 28, 28, 128, T, IS_TRAINING, FORMAT, DEVICE); \
  BM_FusedBatchNormGrad(64, 28, 28, 256, T, IS_TRAINING, FORMAT, DEVICE); \
  BM_FusedBatchNormGrad(64, 28, 28, 512, T, IS_TRAINING, FORMAT, DEVICE); \
                                                                          \
  BM_FusedBatchNormGrad(64, 14, 14, 128, T, IS_TRAINING, FORMAT, DEVICE); \
  BM_FusedBatchNormGrad(64, 14, 14, 256, T, IS_TRAINING, FORMAT, DEVICE); \
  BM_FusedBatchNormGrad(64, 14, 14, 1024, T, IS_TRAINING, FORMAT, DEVICE)

BM_FusedBatchNormGradResnetShapes(fp32, true, NHWC, cpu);
BM_FusedBatchNormGradResnetShapes(fp32, false, NHWC, cpu);

#ifdef GOOGLE_CUDA
BM_FusedBatchNormGradResnetShapes(fp32, true, NHWC, gpu);
BM_FusedBatchNormGradResnetShapes(fp16, true, NHWC, gpu);
BM_FusedBatchNormGradResnetShapes(fp32, true, NCHW, gpu);
BM_FusedBatchNormGradResnetShapes(fp16, true, NCHW, gpu);

BM_FusedBatchNormGradResnetShapes(fp32, false, NHWC, gpu);
BM_FusedBatchNormGradResnetShapes(fp16, false, NHWC, gpu);
#endif  // GOOGLE_CUDA

}  // namespace tensorflow