/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <vector>

#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/fake_input.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_testutil.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"

namespace tensorflow {
class FusedBatchNormOpTest : public OpsTestBase {};

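// In training mode, FusedBatchNorm computes the batch statistics itself, so
// the mean and variance inputs below are empty (zero-element) tensors. The
// expected values follow from the batch normalization arithmetic; both
// channels see the same data {5, 7, 9, 11, 13, 15}:
//
//   batch mean      = (5 + 7 + 9 + 11 + 13 + 15) / 6 = 10
//   biased variance = ((-5)^2 + (-3)^2 + (-1)^2 + 1^2 + 3^2 + 5^2) / 6
//                   = 70 / 6 ~= 11.67
//   y = (x - mean) / sqrt(variance + epsilon) * scale + offset
//     e.g. (5 - 10) / sqrt(11.67 + 0.001) * 4 + 2 ~= -3.86
//
// Note that the expected value of output 2 is the unbiased estimate,
// 70 / 5 = 14, while the normalization itself uses the biased 70 / 6 ~= 11.67.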
TEST_F(FusedBatchNormOpTest, Training) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("epsilon", 0.001)
                   .Attr("is_training", true)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
  AddInputFromArray<float>(TensorShape({2}), {4.0, 4.0});
  AddInputFromArray<float>(TensorShape({2}), {2.0, 2.0});
  AddInputFromArray<float>(TensorShape({0}), {});
  AddInputFromArray<float>(TensorShape({0}), {});

  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
                                      3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
  test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);

  Tensor expected_mean(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_mean, {10, 10});
  test::ExpectTensorNear<float>(expected_mean, *GetOutput(1), 0.01);

  Tensor expected_variance(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_variance, {14.00, 14.00});
  test::ExpectTensorNear<float>(expected_variance, *GetOutput(2), 0.01);
}

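// In inference mode the kernel normalizes with the supplied population
// statistics instead of computing batch statistics. Feeding the population
// mean (10) and the biased batch variance (~11.67) from the training case
// above therefore reproduces exactly the same normalized output.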
TEST_F(FusedBatchNormOpTest, Inference) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_op", "FusedBatchNorm")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("epsilon", 0.001)
                   .Attr("is_training", false)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15});
  AddInputFromArray<float>(TensorShape({2}), {4.0, 4.0});
  AddInputFromArray<float>(TensorShape({2}), {2.0, 2.0});
  AddInputFromArray<float>(TensorShape({2}), {10, 10});
  AddInputFromArray<float>(TensorShape({2}), {11.67f, 11.67f});

  TF_ASSERT_OK(RunOpKernel());

  Tensor expected(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected, {-3.86, -3.86, -1.51, -1.51, 0.83, 0.83,
                                      3.17, 3.17, 5.51, 5.51, 7.86, 7.86});
  test::ExpectTensorNear<float>(expected, *GetOutput(0), 0.01);
}

class FusedBatchNormGradOpTest : public OpsTestBase {};

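// Inputs, in order: y_backprop, x, scale, saved mean, saved variance. With
// x = {1, 7, 4, -3, -11, 13} per channel, the saved mean is 11 / 6 ~= 1.833
// and the saved (biased) variance is ~57.472. Two of the expected outputs
// can be checked by hand:
//
//   offset_backprop = sum(dy)          = 2 + 9 - 4 + 5 + 8 + 7 = 27
//   scale_backprop  = sum(dy * x_hat) ~= -1.65,
//     where x_hat = (x - mean) / sqrt(variance + epsilon).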
TEST_F(FusedBatchNormGradOpTest, Simple) {
  TF_EXPECT_OK(NodeDefBuilder("batch_norm_grad_op", "FusedBatchNormGrad")
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Input(FakeInput(DT_FLOAT))
                   .Attr("epsilon", 0.001)
                   .Finalize(node_def()));
  TF_EXPECT_OK(InitOp());
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {2, 2, 9, 9, -4, -4, 5, 5, 8, 8, 7, 7});
  AddInputFromArray<float>(TensorShape({1, 1, 6, 2}),
                           {1, 1, 7, 7, 4, 4, -3, -3, -11, -11, 13, 13});
  AddInputFromArray<float>(TensorShape({2}), {4, 4});
  AddInputFromArray<float>(TensorShape({2}), {1.833f, 1.833f});
  AddInputFromArray<float>(TensorShape({2}), {57.472f, 57.472f});

  TF_ASSERT_OK(RunOpKernel());

  Tensor expected_x(allocator(), DT_FLOAT, TensorShape({1, 1, 6, 2}));
  test::FillValues<float>(&expected_x, {-1.34, -1.34, 2.47, 2.47, -4.44, -4.44,
                                        0.17, 0.17, 1.60, 1.60, 1.53, 1.53});
  test::ExpectTensorNear<float>(expected_x, *GetOutput(0), 0.01);

  Tensor expected_scale(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_scale, {-1.6488, -1.6488});
  test::ExpectTensorNear<float>(expected_scale, *GetOutput(1), 0.01);

  Tensor expected_offset(allocator(), DT_FLOAT, TensorShape({2}));
  test::FillValues<float>(&expected_offset, {27, 27});
  test::ExpectTensorNear<float>(expected_offset, *GetOutput(2), 0.01);
}

//----------------------------------------------------------------------------//
// Performance benchmarks are below.                                          //
//----------------------------------------------------------------------------//

using fp32 = float;
using fp16 = Eigen::half;

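// Builds a graph with a single FusedBatchNormV3 node. In training mode the
// mean/variance inputs are empty tensors, since the kernel computes them from
// the batch; in inference mode they are fed the same random {c}-element
// tensor that serves as scale and offset.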
template <typename T>
static Graph* FusedBatchNormInference(int n, int h, int w, int c,
                                      bool is_training,
                                      TensorFormat data_format) {
  Graph* g = new Graph(OpRegistry::Global());

  DataType dtype = DataTypeToEnum<T>::value;
  Tensor x_t(dtype, data_format == FORMAT_NHWC ? TensorShape({n, h, w, c})
                                               : TensorShape({n, c, h, w}));
  x_t.flat<T>().setRandom();

  Tensor other_t(DT_FLOAT, TensorShape({c}));
  other_t.flat<float>().setRandom();

  Tensor empty_t(DT_FLOAT, TensorShape({0}));

  Node* x = test::graph::Constant(g, x_t, "x");
  Node* other = test::graph::Constant(g, other_t, "other");
  Node* empty = test::graph::Constant(g, empty_t, "empty");

  Node* fused_batch_norm;
  TF_CHECK_OK(NodeBuilder(g->NewName("fused_batch_norm"), "FusedBatchNormV3")
                  .Input(x)
                  .Input(other)                        // scale
                  .Input(other)                        // offset
                  .Input(is_training ? empty : other)  // mean
                  .Input(is_training ? empty : other)  // variance
                  .Attr("T", dtype)
                  .Attr("U", DT_FLOAT)
                  .Attr("epsilon", 0.001)
                  .Attr("is_training", is_training)
                  .Attr("data_format", ToString(data_format))
                  .Finalize(g, &fused_batch_norm));

  return g;
}

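// Builds a graph with a single FusedBatchNormGradV3 node; the same random
// {c}-element tensor is reused for scale and all saved/reserved statistics.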
template <typename T>
static Graph* FusedBatchNormGrad(int n, int h, int w, int c, bool is_training,
                                 TensorFormat data_format) {
  Graph* g = new Graph(OpRegistry::Global());

  DataType dtype = DataTypeToEnum<T>::value;
  TensorShape shape = data_format == FORMAT_NHWC ? TensorShape({n, h, w, c})
                                                 : TensorShape({n, c, h, w});

  Tensor y_backprop_t(dtype, shape);
  y_backprop_t.flat<T>().setRandom();

  Tensor x_t(dtype, shape);
  x_t.flat<T>().setRandom();

  Tensor other_t(DT_FLOAT, TensorShape({c}));
  other_t.flat<float>().setRandom();

  Node* y_backprop = test::graph::Constant(g, y_backprop_t, "y_backprop");
  Node* x = test::graph::Constant(g, x_t, "x");
  Node* other = test::graph::Constant(g, other_t, "other");

  Node* fused_batch_norm;
  TF_CHECK_OK(
      NodeBuilder(g->NewName("fused_batch_norm_grad"), "FusedBatchNormGradV3")
          .Input(y_backprop)
          .Input(x)
          .Input(other)  // scale
          .Input(other)  // saved_mean_or_pop_mean
          .Input(other)  // saved_maybe_inv_var_or_pop_var
          .Input(other)  // reserve_space
          .Attr("T", dtype)
          .Attr("U", DT_FLOAT)
          .Attr("epsilon", 0.001)
          .Attr("is_training", is_training)
          .Attr("data_format", ToString(data_format))
          .Finalize(g, &fused_batch_norm));

  return g;
}

#define BM_NAME(NAME, N, H, W, C, T, IT, FORMAT, DEVICE) \
  BM_##NAME##_##N##_##H##_##W##_##C##_##IT##_##FORMAT##_##T##_##DEVICE
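// For example, BM_NAME(FusedBatchNorm, 64, 14, 14, 256, fp32, false, NHWC,
// cpu) expands to BM_FusedBatchNorm_64_14_14_256_false_NHWC_fp32_cpu.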

// -------------------------------------------------------------------------- //
// FusedBatchNorm inference
// -------------------------------------------------------------------------- //

#define BM_FusedBatchNorm(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)       \
  static void BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT,   \
                      DEVICE)(int iters) {                                  \
    testing::UseRealTime();                                                 \
    testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C);     \
    test::Benchmark(#DEVICE, FusedBatchNormInference<T>(                    \
                                 N, H, W, C, IS_TRAINING, FORMAT_##FORMAT)) \
        .Run(iters);                                                        \
  }                                                                         \
  BENCHMARK(                                                                \
      BM_NAME(FusedBatchNorm, N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE));

BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, cpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, cpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NHWC, cpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NHWC, cpu);

#ifdef GOOGLE_CUDA
BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NHWC, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NHWC, gpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, false, NCHW, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, false, NCHW, gpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NHWC, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NHWC, gpu);

BM_FusedBatchNorm(64, 14, 14, 256, fp32, true, NCHW, gpu);
BM_FusedBatchNorm(64, 14, 14, 256, fp16, true, NCHW, gpu);
#endif  // GOOGLE_CUDA

// -------------------------------------------------------------------------- //
// FusedBatchNorm gradient
// -------------------------------------------------------------------------- //

#define BM_FusedBatchNormGrad(N, H, W, C, T, IS_TRAINING, FORMAT, DEVICE)     \
  static void BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT, \
                      DEVICE)(int iters) {                                    \
    testing::UseRealTime();                                                   \
    testing::ItemsProcessed(static_cast<int64>(iters) * N * H * W * C);       \
    test::Benchmark(#DEVICE, FusedBatchNormGrad<T>(N, H, W, C, IS_TRAINING,   \
                                                   FORMAT_##FORMAT))          \
        .Run(iters);                                                          \
  }                                                                           \
  BENCHMARK(BM_NAME(FusedBatchNormGrad, N, H, W, C, T, IS_TRAINING, FORMAT,   \
                    DEVICE));

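// Benchmarks the gradient over activation shapes that appear in ResNet-style
// models at batch size 64 (feature maps of 56x56, 28x28, and 14x14).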
#define BM_FusedBatchNormGradResnetShapes(T, IS_TRAINING, FORMAT, DEVICE) \
  BM_FusedBatchNormGrad(64, 56, 56, 64, T, IS_TRAINING, FORMAT, DEVICE);  \
  BM_FusedBatchNormGrad(64, 56, 56, 128, T, IS_TRAINING, FORMAT, DEVICE); \
  BM_FusedBatchNormGrad(64, 56, 56, 256, T, IS_TRAINING, FORMAT, DEVICE); \
                                                                          \
  BM_FusedBatchNormGrad(64, 28, 28, 128, T, IS_TRAINING, FORMAT, DEVICE); \
  BM_FusedBatchNormGrad(64, 28, 28, 256, T, IS_TRAINING, FORMAT, DEVICE); \
  BM_FusedBatchNormGrad(64, 28, 28, 512, T, IS_TRAINING, FORMAT, DEVICE); \
                                                                          \
  BM_FusedBatchNormGrad(64, 14, 14, 128, T, IS_TRAINING, FORMAT, DEVICE); \
  BM_FusedBatchNormGrad(64, 14, 14, 256, T, IS_TRAINING, FORMAT, DEVICE); \
  BM_FusedBatchNormGrad(64, 14, 14, 1024, T, IS_TRAINING, FORMAT, DEVICE)

BM_FusedBatchNormGradResnetShapes(fp32, true, NHWC, cpu);
BM_FusedBatchNormGradResnetShapes(fp32, false, NHWC, cpu);

#ifdef GOOGLE_CUDA
BM_FusedBatchNormGradResnetShapes(fp32, true, NHWC, gpu);
BM_FusedBatchNormGradResnetShapes(fp16, true, NHWC, gpu);
BM_FusedBatchNormGradResnetShapes(fp32, true, NCHW, gpu);
BM_FusedBatchNormGradResnetShapes(fp16, true, NCHW, gpu);

BM_FusedBatchNormGradResnetShapes(fp32, false, NHWC, gpu);
BM_FusedBatchNormGradResnetShapes(fp16, false, NHWC, gpu);
#endif  // GOOGLE_CUDA

}  // namespace tensorflow