• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2 
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6 
7     http://www.apache.org/licenses/LICENSE-2.0
8 
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15 
16 #include <string>
17 #include <vector>
18 
19 #include "absl/algorithm/container.h"
20 #include "tensorflow/cc/ops/const_op.h"
21 #include "tensorflow/cc/ops/image_ops.h"
22 #include "tensorflow/cc/ops/nn_ops.h"
23 #include "tensorflow/cc/ops/nn_ops_internal.h"
24 #include "tensorflow/cc/ops/standard_ops.h"
25 #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
26 #include "tensorflow/core/framework/fake_input.h"
27 #include "tensorflow/core/framework/node_def_builder.h"
28 #include "tensorflow/core/framework/tensor.h"
29 #include "tensorflow/core/framework/types.pb.h"
30 #include "tensorflow/core/kernels/conv_ops_gpu.h"
31 #include "tensorflow/core/kernels/ops_testutil.h"
32 #include "tensorflow/core/kernels/ops_util.h"
33 #include "tensorflow/core/lib/core/status_test_util.h"
34 #include "tensorflow/core/platform/tensor_float_32_utils.h"
35 #include "tensorflow/core/platform/test.h"
36 #include "tensorflow/core/platform/test_benchmark.h"
37 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
38 #include "tensorflow/core/public/session.h"
39 
40 namespace tensorflow {
41 
42 #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
43 
// Test peer that forwards to ConvParameters' private/internal
// ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>() predicate so tests can
// probe it directly.
struct ConvParametersPeer {
  template <typename T>
  bool ShouldIncludeWinogradNonfusedAlgoPreCudnn7() {
    return params.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<T>();
  }

  // The parameters under test; aggregate-initialized by each test case.
  ConvParameters params;
};
52 
// Checks the pre-cuDNN-7 gate for the Winograd non-fused algorithm on two
// configurations that differ only in channel counts: the small convolution
// should admit the algorithm, while the larger one (128 in_depths /
// 768 out_depths) should be rejected — presumably because its scratch
// requirements exceed the internal limit (the exact threshold lives inside
// ConvParameters).
TEST(ConvParameters, WinogradNonfusedAlgoSize) {
  ConvParametersPeer conv_params_small = {{
      1,            // batch
      32,           // in_depths
      {{300,        // in_rows
        300}},      // in_cols
      FORMAT_NCHW,  // compute_data_format
      128,          // out_depths
      {{3,          // filter_rows
        3}},        // filter_cols
      {{1,          // dilation_rows
        1}},        // dilation_cols
      {{1,          // stride_rows
        1}},        // stride_cols
      {{0,          // padding_rows
        0}},        // padding_cols
      DT_FLOAT,     // tensor datatype
      0,            // device_id
  }};
  EXPECT_TRUE(
      conv_params_small.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<float>());

  ConvParametersPeer conv_params_large = {{
      1,            // batch
      128,          // in_depths
      {{300,        // in_rows
        300}},      // in_cols
      FORMAT_NCHW,  // compute_data_format
      768,          // out_depths
      {{3,          // filter_rows
        3}},        // filter_cols
      {{1,          // dilation_rows
        1}},        // dilation_cols
      {{1,          // stride_rows
        1}},        // stride_cols
      {{0,          // padding_rows
        0}},        // padding_cols
      DT_FLOAT,     // tensor datatype
      0,            // device_id
  }};
  EXPECT_FALSE(
      conv_params_large.ShouldIncludeWinogradNonfusedAlgoPreCudnn7<float>());
}
96 
97 #endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
98 
// Tests for the fused resize+pad+conv kernels (FusedResizeAndPadConv2D and
// FusedPadConv2D): one hand-computed exact check, plus comparisons of the
// fused ops against equivalent graphs built from the separate
// ResizeBilinear / MirrorPad / Conv2D ops.
class FusedResizePadConvOpTest : public OpsTestBase {
 protected:
  // Runs FusedResizeAndPadConv2D with an identity resize and all-zero
  // padding on a hand-computed 3x4 image with a 3x3 filter, so the op
  // reduces to a plain 'SAME' convolution whose output is checked exactly.
  template <typename T>
  void HandwrittenConv(DataType dtype) {
    const int stride = 1;
    TF_EXPECT_OK(NodeDefBuilder("fused_resize_op", "FusedResizeAndPadConv2D")
                     .Input(FakeInput(dtype))
                     .Input(FakeInput(DT_INT32))
                     .Input(FakeInput(DT_INT32))
                     .Input(FakeInput(dtype))
                     .Attr("T", dtype)
                     .Attr("resize_align_corners", false)
                     .Attr("mode", "REFLECT")
                     .Attr("strides", {1, stride, stride, 1})
                     .Attr("padding", "SAME")
                     .Finalize(node_def()));
    TF_EXPECT_OK(InitOp());
    const int depth = 1;
    const int image_width = 4;
    const int image_height = 3;
    const int image_batch_count = 1;
    // The image matrix is:
    // |  1 |  2 |  3 |  4 |
    // |  5 |  6 |  7 |  8 |
    // |  9 | 10 | 11 | 12 |
    Tensor image(dtype, {image_batch_count, image_height, image_width, depth});
    test::FillValues<T>(&image, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});

    // The filter matrix is:
    // | 1 | 4 | 7 |
    // | 2 | 5 | 8 |
    // | 3 | 6 | 9 |
    const int filter_size = 3;
    const int filter_count = 1;
    Tensor filter(dtype, {filter_size, filter_size, depth, filter_count});
    test::FillValues<T>(&filter, {1, 4, 7, 2, 5, 8, 3, 6, 9});

    // Resize target equals the input size and every pad amount is zero, so
    // the resize/pad stages are no-ops for this test.
    const int resized_width = image_width;
    const int resized_height = image_height;

    const int top_padding = 0;
    const int bottom_padding = 0;
    const int left_padding = 0;
    const int right_padding = 0;

    AddInputFromArray<T>(image.shape(), image.flat<T>());
    AddInputFromArray<int32>(TensorShape({2}), {resized_height, resized_width});
    AddInputFromArray<int32>(
        TensorShape({4, 2}),
        {0, 0, top_padding, bottom_padding, left_padding, right_padding, 0, 0});
    AddInputFromArray<T>(filter.shape(), filter.flat<T>());
    TF_ASSERT_OK(RunOpKernel());

    // We're sliding the 3x3 filter across the 3x4 image, with accesses outside
    // the input set to zero because we're using the 'SAME' padding mode.
    // The calculations behind the expected output are:
    // (1*0)+(4*0)+(7*0)+(2*0)+(5*1)+(8*2)+(3*0)+(6*5)+(9*6)=105
    // (1*0)+(4*0)+(7*0)+(2*1)+(5*2)+(8*3)+(3*5)+(6*6)+(9*7)=150
    // (1*0)+(4*0)+(7*0)+(2*2)+(5*3)+(8*4)+(3*6)+(6*7)+(9*8)=183
    // (1*0)+(4*0)+(7*0)+(2*3)+(5*4)+(8*0)+(3*7)+(6*8)+(9*0)=95
    // (1*0)+(4*1)+(7*2)+(2*0)+(5*5)+(8*6)+(3*0)+(6*9)+(9*10)=235
    // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)=312
    // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)=357
    // (1*3)+(4*4)+(7*0)+(2*7)+(5*8)+(8*0)+(3*11)+(6*12)+(9*0)=178
    // (1*0)+(4*5)+(7*6)+(2*0)+(5*9)+(8*10)+(3*0)+(6*0)+(9*0)=187
    // (1*5)+(4*6)+(7*7)+(2*9)+(5*10)+(8*11)+(3*0)+(6*0)+(9*0)=234
    // (1*6)+(4*7)+(7*8)+(2*10)+(5*11)+(8*12)+(3*0)+(6*0)+(9*0)=261
    // (1*7)+(4*8)+(7*0)+(2*11)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)=121
    // This means we should end up with this matrix:
    // |  105  |  150  |  183  |   95  |
    // |  235  |  312  |  357  |  178  |
    // |  187  |  234  |  261  |  121  |
    const int expected_width = image_width;
    const int expected_height = image_height * filter_count;
    Tensor expected(dtype, TensorShape({image_batch_count, expected_height,
                                        expected_width, filter_count}));
    test::FillValues<T>(
        &expected, {105, 150, 183, 95, 235, 312, 357, 178, 187, 234, 261, 121});
    const Tensor& output = *GetOutput(0);
    test::ExpectTensorNear<T>(expected, output, 1e-5);
  }

  // Builds two graphs over iota-filled input/filter tensors — an unfused
  // ResizeBilinear -> Cast -> MirrorPad -> Conv2D chain and the single
  // FusedResizeAndPadConv2D op with the same attributes — runs both in one
  // session, and asserts the outputs are close.
  template <typename T>
  void CompareFusedAndSeparate(int input_width, int input_height,
                               int input_depth, int resize_width,
                               int resize_height, int y_padding, int x_padding,
                               int filter_size, int filter_count,
                               bool resize_align_corners,
                               const string& pad_mode, int stride,
                               const string& padding, DataType dtype) {
    Scope root = tensorflow::Scope::NewRootScope();
    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)

    Tensor input_data(DT_FLOAT,
                      TensorShape({1, input_height, input_width, input_depth}));
    test::FillIota<float>(&input_data, 1.0f);
    Output input =
        Const(root.WithOpName("input"), Input::Initializer(input_data));
    Output casted_input = Cast(root.WithOpName("casted_input"), input, dtype);

    Tensor filter_data(DT_FLOAT, TensorShape({filter_size, filter_size,
                                              input_depth, filter_count}));
    test::FillIota<float>(&filter_data, 1.0f);
    Output filter =
        Const(root.WithOpName("filter"), Input::Initializer(filter_data));
    Output casted_filter =
        Cast(root.WithOpName("casted_filter"), filter, dtype);

    Output resize_size =
        Const(root.WithOpName("resize_size"), {resize_height, resize_width});
    // NOTE: the unfused resize consumes the un-casted float `input`, while
    // the fused op below consumes `casted_input`.
    Output resize =
        ResizeBilinear(root.WithOpName("resize"), input, resize_size,
                       ResizeBilinear::AlignCorners(resize_align_corners));
    // Bilinear resize only output float, cast it to dtype to match the input.
    Output casted_resize = Cast(root.WithOpName("cast"), resize, dtype);
    // Paddings in NHWC order: batch, height, width, channel; symmetric
    // y/x padding, none on batch or channel.
    Output paddings =
        Const(root.WithOpName("paddings"),
              {{0, 0}, {y_padding, y_padding}, {x_padding, x_padding}, {0, 0}});
    Output mirror_pad = MirrorPad(root.WithOpName("mirror_pad"), casted_resize,
                                  paddings, pad_mode);
    Output conv = Conv2D(root.WithOpName("conv"), mirror_pad, casted_filter,
                         {1, stride, stride, 1}, padding);

    Output fused_conv = FusedResizeAndPadConv2D(
        root.WithOpName("fused_conv"), casted_input, resize_size, paddings,
        casted_filter, pad_mode, {1, stride, stride, 1}, padding,
        FusedResizeAndPadConv2D::ResizeAlignCorners(resize_align_corners));

    tensorflow::GraphDef graph;
    TF_ASSERT_OK(root.ToGraphDef(&graph));

    std::unique_ptr<tensorflow::Session> session(
        tensorflow::NewSession(tensorflow::SessionOptions()));
    TF_ASSERT_OK(session->Create(graph));

    std::vector<Tensor> unfused_tensors;
    TF_ASSERT_OK(session->Run({}, {"conv"}, {}, &unfused_tensors));

    std::vector<Tensor> fused_tensors;
    TF_ASSERT_OK(session->Run({}, {"fused_conv"}, {}, &fused_tensors));

    test::ExpectClose(unfused_tensors[0], fused_tensors[0]);
  }

  // Same comparison as CompareFusedAndSeparate but without the resize stage:
  // unfused MirrorPad -> Conv2D versus the single FusedPadConv2D op.
  template <typename T>
  void CompareFusedPadOnlyAndSeparate(int input_width, int input_height,
                                      int input_depth, int y_padding,
                                      int x_padding, int filter_size,
                                      int filter_count, const string& pad_mode,
                                      int stride, const string& padding,
                                      DataType dtype) {
    Scope root = tensorflow::Scope::NewRootScope();
    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)

    Tensor input_data(DT_FLOAT,
                      TensorShape({1, input_height, input_width, input_depth}));
    test::FillIota<float>(&input_data, 1.0f);
    Output input =
        Const(root.WithOpName("input"), Input::Initializer(input_data));
    Output casted_input = Cast(root.WithOpName("casted_input"), input, dtype);

    Tensor filter_data(DT_FLOAT, TensorShape({filter_size, filter_size,
                                              input_depth, filter_count}));
    test::FillIota<float>(&filter_data, 1.0f);
    Output filter =
        Const(root.WithOpName("filter"), Input::Initializer(filter_data));
    Output casted_filter =
        Cast(root.WithOpName("casted_filter"), filter, dtype);

    Output paddings =
        Const(root.WithOpName("paddings"),
              {{0, 0}, {y_padding, y_padding}, {x_padding, x_padding}, {0, 0}});
    Output mirror_pad = MirrorPad(root.WithOpName("mirror_pad"), casted_input,
                                  paddings, pad_mode);
    Output conv = Conv2D(root.WithOpName("conv"), mirror_pad, casted_filter,
                         {1, stride, stride, 1}, padding);

    Output fused_conv = FusedPadConv2D(
        root.WithOpName("fused_conv"), casted_input, paddings, casted_filter,
        pad_mode, {1, stride, stride, 1}, padding);

    tensorflow::GraphDef graph;
    TF_ASSERT_OK(root.ToGraphDef(&graph));

    std::unique_ptr<tensorflow::Session> session(
        tensorflow::NewSession(tensorflow::SessionOptions()));
    TF_ASSERT_OK(session->Create(graph));

    std::vector<Tensor> unfused_tensors;
    TF_ASSERT_OK(session->Run({}, {"conv"}, {}, &unfused_tensors));

    std::vector<Tensor> fused_tensors;
    TF_ASSERT_OK(session->Run({}, {"fused_conv"}, {}, &fused_tensors));

    test::ExpectClose(unfused_tensors[0], fused_tensors[0]);
  }
};
296 
// Exact hand-computed output check for each supported dtype.
TEST_F(FusedResizePadConvOpTest, HandwrittenConvHalf) {
  HandwrittenConv<Eigen::half>(DT_HALF);
}

TEST_F(FusedResizePadConvOpTest, HandwrittenConvFloat) {
  HandwrittenConv<float>(DT_FLOAT);
}

TEST_F(FusedResizePadConvOpTest, HandwrittenConvDouble) {
  HandwrittenConv<double>(DT_DOUBLE);
}

// Fused-vs-separate comparisons. Argument order for CompareFusedAndSeparate:
// input_width, input_height, input_depth, resize_width, resize_height,
// y_padding, x_padding, filter_size, filter_count, resize_align_corners,
// pad_mode, stride, padding, dtype.

// Identity cases: no resize change, no padding, 1x1 filter.
TEST_F(FusedResizePadConvOpTest, IdentityComparativeHalf) {
  CompareFusedAndSeparate<Eigen::half>(10, 10, 1, 10, 10, 0, 0, 1, 1, false,
                                       "REFLECT", 1, "SAME", DT_HALF);
}

TEST_F(FusedResizePadConvOpTest, IdentityComparativeFloat) {
  CompareFusedAndSeparate<float>(10, 10, 1, 10, 10, 0, 0, 1, 1, false,
                                 "REFLECT", 1, "SAME", DT_FLOAT);
}

TEST_F(FusedResizePadConvOpTest, IdentityComparativeDouble) {
  CompareFusedAndSeparate<double>(10, 10, 1, 10, 10, 0, 0, 1, 1, false,
                                  "REFLECT", 1, "SAME", DT_DOUBLE);
}

// Exercise only the convolution stage (multi-channel, 4x4 filter).
TEST_F(FusedResizePadConvOpTest, ConvOnlyComparative) {
  CompareFusedAndSeparate<float>(10, 10, 3, 10, 10, 0, 0, 4, 4, false,
                                 "REFLECT", 1, "SAME", DT_FLOAT);
}

// Exercise only the resize stage (10x10 upsampled to 20x20).
TEST_F(FusedResizePadConvOpTest, ResizeOnlyComparative) {
  CompareFusedAndSeparate<float>(10, 10, 1, 20, 20, 0, 0, 1, 1, false,
                                 "REFLECT", 1, "SAME", DT_FLOAT);
}

// Resize and convolution combined, with align_corners, stride, and VALID
// padding variants.
TEST_F(FusedResizePadConvOpTest, ResizeAndConvComparative) {
  CompareFusedAndSeparate<float>(2, 2, 4, 4, 2, 0, 0, 2, 2, false, "REFLECT", 1,
                                 "SAME", DT_FLOAT);
}

TEST_F(FusedResizePadConvOpTest, ResizeAlignAndConvComparative) {
  CompareFusedAndSeparate<float>(2, 2, 4, 4, 2, 0, 0, 2, 2, true, "REFLECT", 1,
                                 "SAME", DT_FLOAT);
}

TEST_F(FusedResizePadConvOpTest, ResizeAndConvStridedComparative) {
  CompareFusedAndSeparate<float>(2, 2, 4, 4, 2, 0, 0, 2, 2, false, "REFLECT", 2,
                                 "SAME", DT_FLOAT);
}

TEST_F(FusedResizePadConvOpTest, ResizeAlignAndConvValidComparative) {
  CompareFusedAndSeparate<float>(2, 2, 4, 4, 2, 0, 0, 2, 2, true, "REFLECT", 1,
                                 "VALID", DT_FLOAT);
}

// Padding-stage variants: REFLECT and SYMMETRIC modes, with and without
// channels and resize.
TEST_F(FusedResizePadConvOpTest, PadOnlyComparative) {
  CompareFusedAndSeparate<float>(4, 4, 1, 4, 4, 2, 2, 1, 1, false, "REFLECT", 1,
                                 "SAME", DT_FLOAT);
}

TEST_F(FusedResizePadConvOpTest, PadOnlyWithChannelsComparative) {
  CompareFusedAndSeparate<float>(4, 4, 3, 4, 4, 2, 2, 1, 1, false, "REFLECT", 1,
                                 "SAME", DT_FLOAT);
}

TEST_F(FusedResizePadConvOpTest, ResizeAndPadComparative) {
  CompareFusedAndSeparate<float>(4, 4, 1, 6, 6, 2, 2, 1, 1, false, "REFLECT", 1,
                                 "SAME", DT_FLOAT);
}

TEST_F(FusedResizePadConvOpTest, PadOnlySymmetricComparative) {
  CompareFusedAndSeparate<float>(4, 4, 1, 4, 4, 2, 2, 1, 1, false, "SYMMETRIC",
                                 1, "SAME", DT_FLOAT);
}

TEST_F(FusedResizePadConvOpTest, ResizeAndPadSymmetricComparative) {
  CompareFusedAndSeparate<float>(4, 4, 3, 6, 6, 2, 2, 1, 1, false, "SYMMETRIC",
                                 1, "SAME", DT_FLOAT);
}

TEST_F(FusedResizePadConvOpTest, ResizeAndPadSymmetricComparativeLarge) {
  CompareFusedAndSeparate<float>(1000, 1000, 3, 1006, 1006, 2, 2, 1, 1, false,
                                 "SYMMETRIC", 1, "SAME", DT_FLOAT);
}

// FusedPadConv2D (no resize) comparisons. Argument order for
// CompareFusedPadOnlyAndSeparate: input_width, input_height, input_depth,
// y_padding, x_padding, filter_size, filter_count, pad_mode, stride,
// padding, dtype.
TEST_F(FusedResizePadConvOpTest, NoResizeIdentityComparativeHalf) {
  CompareFusedPadOnlyAndSeparate<Eigen::half>(10, 10, 1, 0, 0, 1, 1, "REFLECT",
                                              1, "SAME", DT_HALF);
}

TEST_F(FusedResizePadConvOpTest, NoResizeIdentityComparativeFloat) {
  CompareFusedPadOnlyAndSeparate<float>(10, 10, 1, 0, 0, 1, 1, "REFLECT", 1,
                                        "SAME", DT_FLOAT);
}

TEST_F(FusedResizePadConvOpTest, NoResizeIdentityComparativeDouble) {
  CompareFusedPadOnlyAndSeparate<double>(10, 10, 1, 0, 0, 1, 1, "REFLECT", 1,
                                         "SAME", DT_DOUBLE);
}

TEST_F(FusedResizePadConvOpTest, NoResizeConvOnlyComparative) {
  CompareFusedPadOnlyAndSeparate<float>(10, 10, 3, 0, 0, 4, 4, "REFLECT", 1,
                                        "SAME", DT_FLOAT);
}

TEST_F(FusedResizePadConvOpTest, NoResizePadOnlyComparative) {
  CompareFusedPadOnlyAndSeparate<float>(4, 4, 1, 2, 2, 1, 1, "REFLECT", 1,
                                        "SAME", DT_FLOAT);
}

TEST_F(FusedResizePadConvOpTest, NoResizePadOnlyWithChannelsComparative) {
  CompareFusedPadOnlyAndSeparate<float>(4, 4, 3, 2, 2, 1, 1, "REFLECT", 1,
                                        "SAME", DT_FLOAT);
}

TEST_F(FusedResizePadConvOpTest, NoResizePadOnlySymmetricComparative) {
  CompareFusedPadOnlyAndSeparate<float>(4, 4, 1, 2, 2, 1, 1, "SYMMETRIC", 1,
                                        "SAME", DT_FLOAT);
}
418 
// Direct tests of the plain Conv2D kernel with small hand-computed inputs.
class ConvOpTest : public OpsTestBase {
 protected:
  // Runs Conv2D ('SAME', stride 1) on a 3x4 image with a 3x3 filter and
  // checks the exact hand-computed output.
  void HandwrittenConv() {
    const int stride = 1;
    TF_EXPECT_OK(NodeDefBuilder("conv_op", "Conv2D")
                     .Input(FakeInput(DT_FLOAT))
                     .Input(FakeInput(DT_FLOAT))
                     .Attr("T", DT_FLOAT)
                     .Attr("strides", {1, stride, stride, 1})
                     .Attr("padding", "SAME")
                     .Finalize(node_def()));
    TF_EXPECT_OK(InitOp());
    const int depth = 1;
    const int image_width = 4;
    const int image_height = 3;
    const int image_batch_count = 1;
    // The image matrix is:
    // |  1 |  2 |  3 |  4 |
    // |  5 |  6 |  7 |  8 |
    // |  9 | 10 | 11 | 12 |
    Tensor image(DT_FLOAT,
                 {image_batch_count, image_height, image_width, depth});
    test::FillValues<float>(&image, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});

    // The filter matrix is:
    // | 1 | 4 | 7 |
    // | 2 | 5 | 8 |
    // | 3 | 6 | 9 |
    const int filter_size = 3;
    const int filter_count = 1;
    Tensor filter(DT_FLOAT, {filter_size, filter_size, depth, filter_count});
    test::FillValues<float>(&filter, {1, 4, 7, 2, 5, 8, 3, 6, 9});

    AddInputFromArray<float>(image.shape(), image.flat<float>());
    AddInputFromArray<float>(filter.shape(), filter.flat<float>());
    TF_ASSERT_OK(RunOpKernel());

    // We're sliding the 3x3 filter across the 3x4 image, with accesses outside
    // the input set to zero because we're using the 'SAME' padding mode.
    // The calculations behind the expected output are:
    // (1*0)+(4*0)+(7*0)+(2*0)+(5*1)+(8*2)+(3*0)+(6*5)+(9*6)=105
    // (1*0)+(4*0)+(7*0)+(2*1)+(5*2)+(8*3)+(3*5)+(6*6)+(9*7)=150
    // (1*0)+(4*0)+(7*0)+(2*2)+(5*3)+(8*4)+(3*6)+(6*7)+(9*8)=183
    // (1*0)+(4*0)+(7*0)+(2*3)+(5*4)+(8*0)+(3*7)+(6*8)+(9*0)=95
    // (1*0)+(4*1)+(7*2)+(2*0)+(5*5)+(8*6)+(3*0)+(6*9)+(9*10)=235
    // (1*1)+(4*2)+(7*3)+(2*5)+(5*6)+(8*7)+(3*9)+(6*10)+(9*11)=312
    // (1*2)+(4*3)+(7*4)+(2*6)+(5*7)+(8*8)+(3*10)+(6*11)+(9*12)=357
    // (1*3)+(4*4)+(7*0)+(2*7)+(5*8)+(8*0)+(3*11)+(6*12)+(9*0)=178
    // (1*0)+(4*5)+(7*6)+(2*0)+(5*9)+(8*10)+(3*0)+(6*0)+(9*0)=187
    // (1*5)+(4*6)+(7*7)+(2*9)+(5*10)+(8*11)+(3*0)+(6*0)+(9*0)=234
    // (1*6)+(4*7)+(7*8)+(2*10)+(5*11)+(8*12)+(3*0)+(6*0)+(9*0)=261
    // (1*7)+(4*8)+(7*0)+(2*11)+(5*12)+(8*0)+(3*0)+(6*0)+(9*0)=121
    // This means we should end up with this matrix:
    // |  105  |  150  |  183  |   95  |
    // |  235  |  312  |  357  |  178  |
    // |  187  |  234  |  261  |  121  |
    const int expected_width = image_width;
    const int expected_height = image_height * filter_count;
    Tensor expected(DT_FLOAT, TensorShape({image_batch_count, expected_height,
                                           expected_width, filter_count}));
    test::FillValues<float>(
        &expected, {105, 150, 183, 95, 235, 312, 357, 178, 187, 234, 261, 121});
    const Tensor& output = *GetOutput(0);
    test::ExpectTensorNear<float>(expected, output, 1e-5);
  }

  // Runs Conv2D ('VALID') with different horizontal (3) and vertical (1)
  // strides on a 3x6 image with a 2x2 filter, checking the exact output.
  void AnisotropicStrides() {
    const int stride_width = 3;
    const int stride_height = 1;
    TF_EXPECT_OK(NodeDefBuilder("conv_op", "Conv2D")
                     .Input(FakeInput(DT_FLOAT))
                     .Input(FakeInput(DT_FLOAT))
                     .Attr("T", DT_FLOAT)
                     .Attr("strides", {1, stride_height, stride_width, 1})
                     .Attr("padding", "VALID")
                     .Finalize(node_def()));
    TF_EXPECT_OK(InitOp());
    const int depth = 1;
    const int image_width = 6;
    const int image_height = 3;
    const int image_batch_count = 1;
    Tensor image(DT_FLOAT,
                 {image_batch_count, image_height, image_width, depth});
    test::FillValues<float>(&image, {
                                        3, 2, 1, -1, -2, -3,  //
                                        4, 3, 2, -2, -3, -4,  //
                                        5, 4, 3, -3, -4, -5,  //
                                    });
    const int filter_size = 2;
    const int filter_count = 1;
    Tensor filter(DT_FLOAT, {filter_size, filter_size, depth, filter_count});
    test::FillValues<float>(&filter, {
                                         1, 2,  //
                                         3, 4,  //
                                     });

    AddInputFromArray<float>(image.shape(), image.flat<float>());
    AddInputFromArray<float>(filter.shape(), filter.flat<float>());
    TF_ASSERT_OK(RunOpKernel());

    // VALID padding with strides (h=1, w=3) yields a 2x2 output.
    const int expected_width = 2;
    const int expected_height = 2;
    Tensor expected(DT_FLOAT, TensorShape({image_batch_count, expected_height,
                                           expected_width, filter_count}));
    test::FillValues<float>(&expected, {31, -23, 41, -33});
    const Tensor& output = *GetOutput(0);
    test::ExpectTensorNear<float>(expected, output, 1e-5);
  }
};
528 
TEST_F(ConvOpTest, HandwrittenConv) { HandwrittenConv(); }

TEST_F(ConvOpTest, AnisotropicStride) { AnisotropicStrides(); }
532 
533 template <typename T>
534 class FusedConv2DOpTest : public OpsTestBase {
535  protected:
536   static constexpr int kDepth = 3;
537   static constexpr int kImageWidth = 32;
538   static constexpr int kImageHeight = 32;
539   static constexpr int kImageBatchCount = 8;
540 
541   using BiasAddGraphRunner =
542       std::function<void(const Tensor& input_data, const Tensor& filter_data,
543                          const Tensor& bias_data, Tensor* out)>;
544 
545   using BatchNormGraphRunner = std::function<void(
546       const Tensor& input_data, const Tensor& filter_data,
547       const Tensor& scale_data, const Tensor& offset_data,
548       const Tensor& mean_data, const Tensor& variance_data, Tensor* out)>;
549 
550   // Runs a Tensorflow graph defined by the root scope, and fetches the result
551   // of 'fetch' node into the output Tensor. Optional `fetch_node` parameter
552   // allows to define a fetch node directly using a NodeDef for the ops that are
553   // not supported by the C++ Api.
RunAndFetch(const tensorflow::Scope & root,const string & fetch,Tensor * output,bool allow_gpu_device,const NodeDef * fetch_node=nullptr)554   void RunAndFetch(const tensorflow::Scope& root, const string& fetch,
555                    Tensor* output, bool allow_gpu_device,
556                    const NodeDef* fetch_node = nullptr) {
557     tensorflow::GraphDef graph;
558     TF_ASSERT_OK(root.ToGraphDef(&graph));
559 
560     if (fetch_node) {
561       *graph.add_node() = *fetch_node;
562     }
563 
564     // We really want to make sure that graph executed exactly as we passed it
565     // to the session, so we disable various optimizations.
566     tensorflow::SessionOptions session_options;
567 
568     // Disable common runtime constant folding.
569     session_options.config.mutable_graph_options()
570         ->mutable_optimizer_options()
571         ->set_opt_level(OptimizerOptions::L0);
572 
573     // Disable Grappler optimizations for tests.
574     tensorflow::RewriterConfig* cfg =
575         session_options.config.mutable_graph_options()
576             ->mutable_rewrite_options();
577     cfg->set_constant_folding(tensorflow::RewriterConfig::OFF);
578     cfg->set_layout_optimizer(tensorflow::RewriterConfig::OFF);
579     cfg->set_remapping(tensorflow::RewriterConfig::OFF);
580 
581     std::unique_ptr<tensorflow::Session> session(
582         tensorflow::NewSession(session_options));
583 
584     std::vector<DeviceAttributes> available_devices;
585     TF_ASSERT_OK(session->ListDevices(&available_devices))
586         << "Failed to get available session devices";
587 
588     // Check if session has an available GPU device.
589     const bool has_gpu_device =
590         absl::c_any_of(available_devices, [](const DeviceAttributes& device) {
591           return device.device_type() == DEVICE_GPU;
592         });
593 
594     // Some of the `FusedConv2D` fusion types are implemented only for CPU, and
595     // in this test we don't want to compare GPU vs CPU numbers, so place all
596     // nodes on CPU in this case.
597     const bool place_all_on_gpu = allow_gpu_device && has_gpu_device;
598 
599     const string device = place_all_on_gpu ? "/device:GPU:0" : "/device:CPU:0";
600     for (NodeDef& mutable_node : *graph.mutable_node()) {
601       mutable_node.set_device(device);
602     }
603 
604     TF_ASSERT_OK(session->Create(graph));
605 
606     std::vector<Tensor> unfused_tensors;
607     TF_ASSERT_OK(session->Run({}, {fetch}, {}, &unfused_tensors));
608 
609     *output = unfused_tensors[0];
610   }
611 
RunConv2DWithBias(const Tensor & input_data,const Tensor & filter_data,const Tensor & bias_data,const std::string & padding,const std::vector<int> & explicit_paddings,Tensor * output,bool allow_gpu_device=false,int stride=1)612   void RunConv2DWithBias(const Tensor& input_data, const Tensor& filter_data,
613                          const Tensor& bias_data, const std::string& padding,
614                          const std::vector<int>& explicit_paddings,
615                          Tensor* output, bool allow_gpu_device = false,
616                          int stride = 1) {
617     Scope root = tensorflow::Scope::NewRootScope();
618 
619     ops::Conv2D conv = ops::Conv2D(
620         root.WithOpName("conv"),
621         ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
622         ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
623         {1, stride, stride, 1}, padding,
624         ops::Conv2D::Attrs().ExplicitPaddings(explicit_paddings));
625 
626     ops::BiasAdd with_bias = ops::BiasAdd(
627         root.WithOpName("with_bias"), conv,
628         ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
629 
630     RunAndFetch(root, "with_bias", output, allow_gpu_device);
631   }
632 
RunConv2DWithBiasAndActivation(const Tensor & input_data,const Tensor & filter_data,const Tensor & bias_data,const string & activation_type,const std::string & padding,const std::vector<int> & explicit_paddings,Tensor * output,bool allow_gpu_device=false,int stride=1)633   void RunConv2DWithBiasAndActivation(
634       const Tensor& input_data, const Tensor& filter_data,
635       const Tensor& bias_data, const string& activation_type,
636       const std::string& padding, const std::vector<int>& explicit_paddings,
637       Tensor* output, bool allow_gpu_device = false, int stride = 1) {
638     Scope root = tensorflow::Scope::NewRootScope();
639 
640     ops::Conv2D conv = ops::Conv2D(
641         root.WithOpName("conv"),
642         ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
643         ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
644         {1, stride, stride, 1}, padding,
645         ops::Conv2D::Attrs().ExplicitPaddings(explicit_paddings));
646 
647     ops::BiasAdd with_bias = ops::BiasAdd(
648         root.WithOpName("with_bias"), conv,
649         ops::Const(root.WithOpName("bias"), Input::Initializer(bias_data)));
650 
651     if (activation_type == "Relu") {
652       ops::Relu(root.WithOpName("with_activation"), with_bias);
653     } else if (activation_type == "Relu6") {
654       ops::Relu6(root.WithOpName("with_activation"), with_bias);
655     } else if (activation_type == "Elu") {
656       ops::Elu(root.WithOpName("with_activation"), with_bias);
657     } else if (activation_type == "LeakyRelu") {
658       ops::internal::LeakyRelu(root.WithOpName("with_activation"), with_bias);
659     } else {
660       ops::Identity(root.WithOpName("with_activation"), with_bias);
661     }
662 
663     RunAndFetch(root, "with_activation", output, allow_gpu_device);
664   }
665 
RunConv2DWithBatchNorm(const Tensor & input_data,const Tensor & filter_data,const Tensor & scale_data,const Tensor & offset_data,const Tensor & mean_data,const Tensor & variance_data,const std::string & padding,const std::vector<int> & explicit_paddings,Tensor * output,bool allow_gpu_device=false,int stride=1)666   void RunConv2DWithBatchNorm(
667       const Tensor& input_data, const Tensor& filter_data,
668       const Tensor& scale_data, const Tensor& offset_data,
669       const Tensor& mean_data, const Tensor& variance_data,
670       const std::string& padding, const std::vector<int>& explicit_paddings,
671       Tensor* output, bool allow_gpu_device = false, int stride = 1) {
672     Scope root = tensorflow::Scope::NewRootScope();
673 
674     ops::Conv2D conv = ops::Conv2D(
675         root.WithOpName("conv"),
676         ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
677         ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
678         {1, stride, stride, 1}, padding,
679         ops::Conv2D::Attrs().ExplicitPaddings(explicit_paddings));
680 
681     ops::FusedBatchNorm::Attrs attr;
682     attr = attr.IsTraining(false);
683 
684     ops::FusedBatchNorm with_fused_batch_norm = ops::FusedBatchNorm(
685         root.WithOpName("with_fused_batch_norm"), conv,
686         ops::Const(root.WithOpName("scale"), Input::Initializer(scale_data)),
687         ops::Const(root.WithOpName("offset"), Input::Initializer(offset_data)),
688         ops::Const(root.WithOpName("mean"), Input::Initializer(mean_data)),
689         ops::Const(root.WithOpName("var"), Input::Initializer(variance_data)),
690         attr);
691 
692     RunAndFetch(root, "with_fused_batch_norm", output, allow_gpu_device);
693   }
694 
RunConv2DWithBatchNormAndActivation(const Tensor & input_data,const Tensor & filter_data,const Tensor & scale_data,const Tensor & offset_data,const Tensor & mean_data,const Tensor & variance_data,const string & activation_type,const std::string & padding,const std::vector<int> & explicit_paddings,Tensor * output,bool allow_gpu_device=false,int stride=1)695   void RunConv2DWithBatchNormAndActivation(
696       const Tensor& input_data, const Tensor& filter_data,
697       const Tensor& scale_data, const Tensor& offset_data,
698       const Tensor& mean_data, const Tensor& variance_data,
699       const string& activation_type, const std::string& padding,
700       const std::vector<int>& explicit_paddings, Tensor* output,
701       bool allow_gpu_device = false, int stride = 1) {
702     Scope root = tensorflow::Scope::NewRootScope();
703 
704     ops::Conv2D conv = ops::Conv2D(
705         root.WithOpName("conv"),
706         ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
707         ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
708         {1, stride, stride, 1}, padding,
709         ops::Conv2D::Attrs().ExplicitPaddings(explicit_paddings));
710 
711     ops::FusedBatchNorm::Attrs attr;
712     attr = attr.IsTraining(false);
713 
714     ops::FusedBatchNorm with_fused_batch_norm = ops::FusedBatchNorm(
715         root.WithOpName("with_fused_batch_norm"), conv,
716         ops::Const(root.WithOpName("scale"), Input::Initializer(scale_data)),
717         ops::Const(root.WithOpName("offset"), Input::Initializer(offset_data)),
718         ops::Const(root.WithOpName("mean"), Input::Initializer(mean_data)),
719         ops::Const(root.WithOpName("var"), Input::Initializer(variance_data)),
720         attr);
721 
722     if (activation_type == "Relu") {
723       ops::Relu(root.WithOpName("with_activation"), with_fused_batch_norm.y);
724     } else if (activation_type == "Relu6") {
725       ops::Relu6(root.WithOpName("with_activation"), with_fused_batch_norm.y);
726     } else if (activation_type == "Elu") {
727       ops::Elu(root.WithOpName("with_activation"), with_fused_batch_norm.y);
728     } else if (activation_type == "LeakyRelu") {
729       ops::internal::LeakyRelu(root.WithOpName("with_activation"),
730                                with_fused_batch_norm.y);
731     } else {
732       ops::Identity(root.WithOpName("with_activation"),
733                     with_fused_batch_norm.y);
734     }
735 
736     RunAndFetch(root, "with_activation", output, allow_gpu_device);
737   }
738 
  // Builds and runs a graph with a single _FusedConv2D node and stores its
  // first output in `output`. Because _FusedConv2D has no generated C++ op
  // wrapper, the node is assembled manually with NodeDefBuilder and injected
  // into the graph by RunAndFetch.
  //
  // `args_data` holds the extra fused inputs in the order the kernel expects
  // for `fused_ops` (e.g. {bias} for {"BiasAdd"}, or
  // {scale, offset, mean, variance} for {"FusedBatchNorm"}).
  void RunFusedConv2DOp(const Tensor& input_data, const Tensor& filter_data,
                        const std::vector<Tensor>& args_data,
                        const std::vector<string>& fused_ops,
                        const std::string& padding,
                        const std::vector<int>& explicit_paddings,
                        Tensor* output, bool allow_gpu_device = false,
                        int stride = 1) {
    Scope root = tensorflow::Scope::NewRootScope();

    DataType dtype = DataTypeToEnum<T>::v();
    int num_args = static_cast<int>(args_data.size());

    Output input =
        ops::Const(root.WithOpName("input"), Input::Initializer(input_data));
    Output filter =
        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data));

    // Each extra fused argument becomes a constant node "arg<i>", wired into
    // the variadic `args` input of _FusedConv2D.
    std::vector<NodeDefBuilder::NodeOut> args;
    for (int i = 0; i < num_args; ++i) {
      Output arg = ops::Const(root.WithOpName(absl::StrCat("arg", i)),
                              Input::Initializer(args_data[i]));
      args.emplace_back(arg.name(), 0, dtype);
    }

    // NOTE: Input() calls must stay in this order — it defines the op's input
    // signature (input, filter, args...).
    NodeDef fused_conv2d;
    TF_EXPECT_OK(NodeDefBuilder("fused_conv", "_FusedConv2D")
                     .Input({input.name(), 0, dtype})
                     .Input({filter.name(), 0, dtype})
                     .Input(args)
                     .Attr("num_args", num_args)
                     .Attr("T", dtype)
                     .Attr("strides", {1, stride, stride, 1})
                     .Attr("padding", padding)
                     .Attr("explicit_paddings", explicit_paddings)
                     .Attr("fused_ops", fused_ops)
                     .Finalize(&fused_conv2d));

    // Pass the NodeDef along so RunAndFetch can add the hand-built node to
    // the graph before running it.
    RunAndFetch(root, fused_conv2d.name(), output, allow_gpu_device,
                &fused_conv2d);
  }
779 
VerifyBiasAddTensorsNear(int depth,int image_width,int image_height,int image_batch_count,int filter_size,int filter_count,const BiasAddGraphRunner & run_default,const BiasAddGraphRunner & run_fused)780   void VerifyBiasAddTensorsNear(int depth, int image_width, int image_height,
781                                 int image_batch_count, int filter_size,
782                                 int filter_count,
783                                 const BiasAddGraphRunner& run_default,
784                                 const BiasAddGraphRunner& run_fused) {
785     DataType dtype = DataTypeToEnum<T>::v();
786 
787     Tensor image(dtype, {image_batch_count, image_height, image_width, depth});
788     image.flat<T>() = image.flat<T>().setRandom();
789 
790     // Add some negative values to filter to properly test Relu.
791     Tensor filter(dtype, {filter_size, filter_size, depth, filter_count});
792     filter.flat<T>() = filter.flat<T>().setRandom();
793     filter.flat<T>() -= filter.flat<T>().constant(static_cast<T>(0.5f));
794 
795     const int bias_size = filter_count;
796     Tensor bias(dtype, {bias_size});
797     bias.flat<T>() = bias.flat<T>().setRandom();
798     bias.flat<T>() += bias.flat<T>().constant(static_cast<T>(0.5f));
799 
800     Tensor conv_2d;
801     Tensor fused_conv_2d;
802 
803     run_default(image, filter, bias, &conv_2d);
804     run_fused(image, filter, bias, &fused_conv_2d);
805 
806     ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
807     ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
808 
809     // NOTE(intel-tf): When filter_size is equal to the input image size,
810     // conv2d essentially is element-wise multiplication followed by
811     // a full sum reduction, which causes larger numerical error
812     // than usual cases.
813     if (image_width == filter_size && image_height == filter_size) {
814       test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-4);
815     } else {
816       test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-5);
817     }
818   }
819 
VerifyFusedBatchNormTensorsNear(int depth,int image_width,int image_height,int image_batch_count,int filter_size,int filter_count,const BatchNormGraphRunner & run_default,const BatchNormGraphRunner & run_fused)820   void VerifyFusedBatchNormTensorsNear(int depth, int image_width,
821                                        int image_height, int image_batch_count,
822                                        int filter_size, int filter_count,
823                                        const BatchNormGraphRunner& run_default,
824                                        const BatchNormGraphRunner& run_fused) {
825     DataType dtype = DataTypeToEnum<T>::v();
826 
827     Tensor image(dtype, {image_batch_count, image_height, image_width, depth});
828     image.flat<T>() = image.flat<T>().setRandom();
829 
830     // Add some negative values to filter to properly test Relu.
831     Tensor filter(dtype, {filter_size, filter_size, depth, filter_count});
832     filter.flat<T>() = filter.flat<T>().setRandom();
833     filter.flat<T>() -= filter.flat<T>().constant(static_cast<T>(0.5f));
834 
835     const int scale_size = filter_count;
836 
837     Tensor scale(dtype, {scale_size});
838     scale.flat<T>() = scale.flat<T>().setRandom();
839 
840     Tensor offset(dtype, {scale_size});
841     offset.flat<T>() = offset.flat<T>().setRandom();
842 
843     Tensor mean(dtype, {scale_size});
844     mean.flat<T>() = mean.flat<T>().setRandom();
845 
846     Tensor variance(dtype, {scale_size});
847     variance.flat<T>() = variance.flat<T>().setRandom();
848     variance.flat<T>() += variance.flat<T>().constant(static_cast<T>(0.5f));
849 
850     Tensor conv_2d;
851     Tensor fused_conv_2d;
852 
853     run_default(image, filter, scale, offset, mean, variance, &conv_2d);
854     run_fused(image, filter, scale, offset, mean, variance, &fused_conv_2d);
855 
856     ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
857     ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
858 
859     // NOTE(intel-tf): When filter_size is equal to the input image size,
860     // conv2d essentially is element-wise multiplication followed by
861     // a full sum reduction, which causes larger numerical error
862     // than usual cases.
863     if (image_width == filter_size && image_height == filter_size) {
864       test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-4);
865     } else {
866       test::ExpectClose(conv_2d, fused_conv_2d, /*atol=*/1e-5);
867     }
868   }
869 
870   // Verifies that computing Conv2D+BiasAdd in a graph is identical to
871   // FusedConv2D.
VerifyConv2DWithBias(int filter_size,int filter_count,const std::vector<int> & explicit_paddings={},int depth=kDepth,int image_width=kImageWidth,int image_height=kImageHeight,int image_batch_count=kImageBatchCount)872   void VerifyConv2DWithBias(int filter_size, int filter_count,
873                             const std::vector<int>& explicit_paddings = {},
874                             int depth = kDepth, int image_width = kImageWidth,
875                             int image_height = kImageHeight,
876                             int image_batch_count = kImageBatchCount) {
877     std::string padding = explicit_paddings.empty() ? "SAME" : "EXPLICIT";
878     const BiasAddGraphRunner run_default =
879         [this, &explicit_paddings, padding](
880             const Tensor& input_data, const Tensor& filter_data,
__anone7ca09510202( const Tensor& input_data, const Tensor& filter_data, const Tensor& bias_data, Tensor* out) 881             const Tensor& bias_data, Tensor* out) {
882           RunConv2DWithBias(input_data, filter_data, bias_data, padding,
883                             explicit_paddings, out);
884         };
885 
886     const BiasAddGraphRunner run_fused =
887         [this, explicit_paddings, padding](
888             const Tensor& input_data, const Tensor& filter_data,
__anone7ca09510302( const Tensor& input_data, const Tensor& filter_data, const Tensor& bias_data, Tensor* out) 889             const Tensor& bias_data, Tensor* out) {
890           RunFusedConv2DOp(input_data, filter_data, {bias_data}, {"BiasAdd"},
891                            padding, explicit_paddings, out);
892         };
893 
894     VerifyBiasAddTensorsNear(depth, image_width, image_height,
895                              image_batch_count, filter_size, filter_count,
896                              run_default, run_fused);
897   }
898 
899   // Verifies that computing Conv2D+BiasAdd+{Activation} in a graph is identical
900   // to FusedConv2D.
VerifyConv2DWithBiasAndActivation(const string & activation,int filter_size,int filter_count,const std::vector<int> & explicit_paddings={},int depth=kDepth,int image_width=kImageWidth,int image_height=kImageHeight,int image_batch_count=kImageBatchCount)901   void VerifyConv2DWithBiasAndActivation(
902       const string& activation, int filter_size, int filter_count,
903       const std::vector<int>& explicit_paddings = {}, int depth = kDepth,
904       int image_width = kImageWidth, int image_height = kImageHeight,
905       int image_batch_count = kImageBatchCount) {
906     std::string padding = explicit_paddings.empty() ? "SAME" : "EXPLICIT";
907     const BiasAddGraphRunner run_default =
908         [this, &activation, &explicit_paddings, &padding](
909             const Tensor& input_data, const Tensor& filter_data,
__anone7ca09510402( const Tensor& input_data, const Tensor& filter_data, const Tensor& bias_data, Tensor* out) 910             const Tensor& bias_data, Tensor* out) {
911           RunConv2DWithBiasAndActivation(
912               input_data, filter_data, bias_data, activation, padding,
913               explicit_paddings, out,
914               /*allow_gpu_device=*/activation == "Relu");
915         };
916 
917     const BiasAddGraphRunner run_fused = [this, &activation, &explicit_paddings,
918                                           padding](const Tensor& input_data,
919                                                    const Tensor& filter_data,
920                                                    const Tensor& bias_data,
__anone7ca09510502(const Tensor& input_data, const Tensor& filter_data, const Tensor& bias_data, Tensor* out) 921                                                    Tensor* out) {
922       RunFusedConv2DOp(input_data, filter_data, {bias_data},
923                        {"BiasAdd", activation}, padding, explicit_paddings, out,
924                        /*allow_gpu_device=*/activation == "Relu");
925     };
926 
927     VerifyBiasAddTensorsNear(depth, image_width, image_height,
928                              image_batch_count, filter_size, filter_count,
929                              run_default, run_fused);
930   }
931 
932   // Verifies that computing Conv2D+FusedBatchNorm in a graph is identical to
933   // FusedConv2D.
VerifyConv2DWithBatchNorm(int filter_size,int filter_count,const std::vector<int> & explicit_paddings={},int depth=kDepth,int image_width=kImageWidth,int image_height=kImageHeight,int image_batch_count=kImageBatchCount)934   void VerifyConv2DWithBatchNorm(int filter_size, int filter_count,
935                                  const std::vector<int>& explicit_paddings = {},
936                                  int depth = kDepth,
937                                  int image_width = kImageWidth,
938                                  int image_height = kImageHeight,
939                                  int image_batch_count = kImageBatchCount) {
940     std::string padding = explicit_paddings.empty() ? "SAME" : "EXPLICIT";
941     const BatchNormGraphRunner run_default =
942         [this, explicit_paddings, padding](
943             const Tensor& input_data, const Tensor& filter_data,
944             const Tensor& scale_data, const Tensor& offset_data,
__anone7ca09510602( const Tensor& input_data, const Tensor& filter_data, const Tensor& scale_data, const Tensor& offset_data, const Tensor& mean_data, const Tensor& variance_data, Tensor* out) 945             const Tensor& mean_data, const Tensor& variance_data, Tensor* out) {
946           RunConv2DWithBatchNorm(input_data, filter_data, scale_data,
947                                  offset_data, mean_data, variance_data, padding,
948                                  explicit_paddings, out);
949         };
950 
951     const BatchNormGraphRunner run_fused =
952         [this, explicit_paddings, padding](
953             const Tensor& input_data, const Tensor& filter_data,
954             const Tensor& scale_data, const Tensor& offset_data,
__anone7ca09510702( const Tensor& input_data, const Tensor& filter_data, const Tensor& scale_data, const Tensor& offset_data, const Tensor& mean_data, const Tensor& variance_data, Tensor* out) 955             const Tensor& mean_data, const Tensor& variance_data, Tensor* out) {
956           RunFusedConv2DOp(input_data, filter_data,
957                            {scale_data, offset_data, mean_data, variance_data},
958                            {"FusedBatchNorm"}, padding, explicit_paddings, out);
959         };
960 
961     VerifyFusedBatchNormTensorsNear(depth, image_width, image_height,
962                                     image_batch_count, filter_size,
963                                     filter_count, run_default, run_fused);
964   }
965 
966   // Verifies that computing Conv2D+FusedBatchNorm+{Activation} in a graph is
967   // identical to FusedConv2D.
VerifyConv2DWithBatchNormAndActivation(const string & activation,int filter_size,int filter_count,const std::vector<int> & explicit_paddings={},int depth=kDepth,int image_width=kImageWidth,int image_height=kImageHeight,int image_batch_count=kImageBatchCount)968   void VerifyConv2DWithBatchNormAndActivation(
969       const string& activation, int filter_size, int filter_count,
970       const std::vector<int>& explicit_paddings = {}, int depth = kDepth,
971       int image_width = kImageWidth, int image_height = kImageHeight,
972       int image_batch_count = kImageBatchCount) {
973     std::string padding = explicit_paddings.empty() ? "SAME" : "EXPLICIT";
974     const BatchNormGraphRunner run_default =
975         [this, &activation, explicit_paddings, padding](
976             const Tensor& input_data, const Tensor& filter_data,
977             const Tensor& scale_data, const Tensor& offset_data,
__anone7ca09510802( const Tensor& input_data, const Tensor& filter_data, const Tensor& scale_data, const Tensor& offset_data, const Tensor& mean_data, const Tensor& variance_data, Tensor* out) 978             const Tensor& mean_data, const Tensor& variance_data, Tensor* out) {
979           RunConv2DWithBatchNormAndActivation(
980               input_data, filter_data, scale_data, offset_data, mean_data,
981               variance_data, activation, padding, explicit_paddings, out);
982         };
983 
984     const BatchNormGraphRunner run_fused =
985         [this, &activation, explicit_paddings, padding](
986             const Tensor& input_data, const Tensor& filter_data,
987             const Tensor& scale_data, const Tensor& offset_data,
__anone7ca09510902( const Tensor& input_data, const Tensor& filter_data, const Tensor& scale_data, const Tensor& offset_data, const Tensor& mean_data, const Tensor& variance_data, Tensor* out) 988             const Tensor& mean_data, const Tensor& variance_data, Tensor* out) {
989           RunFusedConv2DOp(input_data, filter_data,
990                            {scale_data, offset_data, mean_data, variance_data},
991                            {"FusedBatchNorm", activation}, padding,
992                            explicit_paddings, out);
993         };
994 
995     VerifyFusedBatchNormTensorsNear(depth, image_width, image_height,
996                                     image_batch_count, filter_size,
997                                     filter_count, run_default, run_fused);
998   }
999 };
1000 
// Conv2D with BatchNorm can be tested only with `T=float`, because default
// `FusedBatchNorm` kernel supports only floats for scale, mean and variance.

// Typed test fixtures parameterized over the element type T; all of the test
// machinery is inherited from FusedConv2DOpTest<T>.
template <typename T>
class FusedConv2DWithBiasOpTest : public FusedConv2DOpTest<T> {};
template <typename T>
class FusedConv2DWithBatchNormOpTest : public FusedConv2DOpTest<T> {};

// Declare the typed-parameterized suites; the individual TYPED_TEST_P cases
// below are registered and instantiated at the end of the file.
TYPED_TEST_SUITE_P(FusedConv2DWithBiasOpTest);
TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest);
1011 
1012 // ROCm does not yet support the _FusedConv2D op,
1013 // Therefore disable tests that check _FusedConv2D, when building with ROCm
1014 
1015 #ifndef TENSORFLOW_USE_ROCM
1016 // -------------------------------------------------------------------------- //
1017 // Conv2D + BiasAdd + {Activation}                                            //
1018 // -------------------------------------------------------------------------- //
1019 
TYPED_TEST_P(FusedConv2DWithBiasOpTest, OneByOneConvolution) {
  // Pointwise (1x1) convolution fused with BiasAdd.
  this->VerifyConv2DWithBias(/*filter_size=*/1, /*filter_count=*/12);
}
1025 
TYPED_TEST_P(FusedConv2DWithBiasOpTest, ImageSizeConvolution) {
  // Filter as large as the image: exercises the full-reduction edge case.
  const int kFilterSize = TestFixture::kImageWidth;
  this->VerifyConv2DWithBias(kFilterSize, /*filter_count=*/12);
}
1031 
TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolution) {
  // Standard 3x3 spatial convolution fused with BiasAdd.
  this->VerifyConv2DWithBias(/*filter_size=*/3, /*filter_count=*/12);
}
1037 
1038 #ifndef INTEL_MKL
TYPED_TEST_P(FusedConv2DWithBiasOpTest, ExplicitPaddingConvolution) {
  // 3x3 convolution with asymmetric explicit (NHWC) paddings.
  this->VerifyConv2DWithBias(/*filter_size=*/3, /*filter_count=*/12,
                             /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0});
}
1045 #endif
1046 
TYPED_TEST_P(FusedConv2DWithBiasOpTest, OneByOneConvolutionAndActivation) {
  // Requires full precision Conv2D op, so TF32 execution is disabled.
  // NOTE(review): TF32 is not re-enabled afterwards — confirm later tests
  // also expect full precision.
  tensorflow::enable_tensor_float_32_execution(false);
  for (const string& activation : {"Relu", "Relu6", "Elu", "LeakyRelu"}) {
    this->VerifyConv2DWithBiasAndActivation(activation, /*filter_size=*/1,
                                            /*filter_count=*/12);
  }
}
1057 
TYPED_TEST_P(FusedConv2DWithBiasOpTest, ImageSizeConvolutionAndActivation) {
  // Full-image filter combined with every supported activation.
  const int kFilterSize = TestFixture::kImageWidth;
  for (const string& activation : {"Relu", "Relu6", "Elu", "LeakyRelu"}) {
    this->VerifyConv2DWithBiasAndActivation(activation, kFilterSize,
                                            /*filter_count=*/12);
  }
}
1066 
TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolutionAndActivation) {
  // 3x3 convolution combined with every supported activation.
  for (const string& activation : {"Relu", "Relu6", "Elu", "LeakyRelu"}) {
    this->VerifyConv2DWithBiasAndActivation(activation, /*filter_size=*/3,
                                            /*filter_count=*/12);
  }
}
1075 
1076 #ifndef INTEL_MKL
TYPED_TEST_P(FusedConv2DWithBiasOpTest,
             ExplicitPaddingConvolutionAndActivation) {
  // Asymmetric explicit paddings combined with every supported activation.
  for (const string& activation : {"Relu", "Relu6", "Elu", "LeakyRelu"}) {
    this->VerifyConv2DWithBiasAndActivation(
        activation, /*filter_size=*/3, /*filter_count=*/12,
        /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0});
  }
}
1087 #endif
1088 
1089 // -------------------------------------------------------------------------- //
1090 // Conv2D + FusedBatchNorm + {Activation}                                     //
1091 // -------------------------------------------------------------------------- //
1092 
TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, OneByOneConvolution) {
  // Pointwise (1x1) convolution fused with FusedBatchNorm.
  this->VerifyConv2DWithBatchNorm(/*filter_size=*/1, /*filter_count=*/12);
}
1098 
TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ImageSizeConvolution) {
  // Filter as large as the image: exercises the full-reduction edge case.
  const int kFilterSize = TestFixture::kImageWidth;
  this->VerifyConv2DWithBatchNorm(kFilterSize, /*filter_count=*/12);
}
1104 
TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolution) {
  // Standard 3x3 spatial convolution fused with FusedBatchNorm.
  this->VerifyConv2DWithBatchNorm(/*filter_size=*/3, /*filter_count=*/12);
}
1110 
1111 #ifndef INTEL_MKL
TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ExplicitPaddingConvolution) {
  // 3x3 convolution with asymmetric explicit (NHWC) paddings.
  this->VerifyConv2DWithBatchNorm(
      /*filter_size=*/3, /*filter_count=*/12,
      /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0});
}
1119 #endif
1120 
TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, OneByOneConvolutionAndActivation) {
  // 1x1 convolution combined with every supported activation.
  for (const string& activation : {"Relu", "Relu6", "Elu", "LeakyRelu"}) {
    this->VerifyConv2DWithBatchNormAndActivation(activation, /*filter_size=*/1,
                                                 /*filter_count=*/12);
  }
}
1129 
TYPED_TEST_P(FusedConv2DWithBatchNormOpTest,
             ImageSizeConvolutionAndActivation) {
  // Full-image filter combined with every supported activation.
  const int kFilterSize = TestFixture::kImageWidth;
  for (const string& activation : {"Relu", "Relu6", "Elu", "LeakyRelu"}) {
    this->VerifyConv2DWithBatchNormAndActivation(activation, kFilterSize,
                                                 /*filter_count=*/12);
  }
}
1139 
TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolutionAndActivation) {
  // 3x3 convolution combined with every supported activation.
  for (const string& activation : {"Relu", "Relu6", "Elu", "LeakyRelu"}) {
    this->VerifyConv2DWithBatchNormAndActivation(activation, /*filter_size=*/3,
                                                 /*filter_count=*/12);
  }
}
1148 
1149 #ifndef INTEL_MKL
TYPED_TEST_P(FusedConv2DWithBatchNormOpTest,
             ExplicitPaddingConvolutionAndActivation) {
  // Asymmetric explicit paddings combined with every supported activation.
  for (const string& activation : {"Relu", "Relu6", "Elu", "LeakyRelu"}) {
    this->VerifyConv2DWithBatchNormAndActivation(
        activation, /*filter_size=*/3, /*filter_count=*/12,
        /*explicit_paddings=*/{0, 0, 1, 2, 3, 4, 0, 0});
  }
}
1160 #endif
1161 
#ifndef INTEL_MKL
// Register the full set of test cases, including the explicit-padding
// variants (their definitions above are guarded by the same #ifndef).
REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBiasOpTest,          //
                            OneByOneConvolution,                //
                            ImageSizeConvolution,               //
                            SpatialConvolution,                 //
                            ExplicitPaddingConvolution,         //
                            OneByOneConvolutionAndActivation,   //
                            ImageSizeConvolutionAndActivation,  //
                            SpatialConvolutionAndActivation,    //
                            ExplicitPaddingConvolutionAndActivation);

REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest,     //
                            OneByOneConvolution,                //
                            ImageSizeConvolution,               //
                            SpatialConvolution,                 //
                            ExplicitPaddingConvolution,         //
                            OneByOneConvolutionAndActivation,   //
                            ImageSizeConvolutionAndActivation,  //
                            SpatialConvolutionAndActivation,    //
                            ExplicitPaddingConvolutionAndActivation);
#else
// Under INTEL_MKL the explicit-padding tests are not defined (see the
// matching #ifndef guards above), so they are excluded from registration.
REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBiasOpTest,          //
                            OneByOneConvolution,                //
                            ImageSizeConvolution,               //
                            SpatialConvolution,                 //
                            OneByOneConvolutionAndActivation,   //
                            ImageSizeConvolutionAndActivation,  //
                            SpatialConvolutionAndActivation);

REGISTER_TYPED_TEST_SUITE_P(FusedConv2DWithBatchNormOpTest,     //
                            OneByOneConvolution,                //
                            ImageSizeConvolution,               //
                            SpatialConvolution,                 //
                            OneByOneConvolutionAndActivation,   //
                            ImageSizeConvolutionAndActivation,  //
                            SpatialConvolutionAndActivation);
#endif
1199 
// Instantiate the BiasAdd suite for both float and double.
using FusedBiasAddDataTypes = ::testing::Types<float, double>;
INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBiasOpTest,
                               FusedBiasAddDataTypes);

// The batch-norm suite is instantiated for float only, since the default
// FusedBatchNorm kernel supports only float scale/mean/variance.
using FusedBatchNormDataTypes = ::testing::Types<float>;
INSTANTIATE_TYPED_TEST_SUITE_P(Test, FusedConv2DWithBatchNormOpTest,
                               FusedBatchNormDataTypes);
1207 
1208 #endif  // TENSORFLOW_USE_ROCM
1209 }  // namespace tensorflow
1210