/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/core/util/tensor_format.h"

namespace tensorflow {
namespace {

// Creates a Graph which applies a unary "func" on a 3D tensor of
// type T with "num" elements.
template <typename T>
static Graph* Unary(const string& func, int num, DataType dtype) {
  Graph* g = new Graph(OpRegistry::Global());
  Tensor data(dtype, TensorShape({64, 64, num / (64 * 64)}));
  CHECK_GT(data.NumElements(), 0);
  data.flat<T>().setRandom();
  test::graph::Unary(g, func, test::graph::Constant(g, data), 0);
  return g;
}

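// Benchmark argument packing: the benchmarks below pass (rows, cols) through a
// single benchmark argument by encoding it as rows * kRows + cols;
// RowsFromArg/ColsFromArg decode it inside each benchmark body. Decoding is
// exact as long as cols < kRows, which holds for every shape used here.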
const int kRows = 100000;

int RowsAndColsArg(int r, int c) { return r * kRows + c; }
int RowsFromArg(int arg) { return (arg / kRows); }
int ColsFromArg(int arg) { return (arg % kRows); }

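// BM_UNARY defines a benchmark BM_<DEVICE>_<FUNC>_<TYPE> that builds the unary
// graph above and runs it over element counts generated by
// Range(4 << 10, 1 << 20), reporting items and bytes processed so throughput
// shows up in the benchmark output.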
#define BM_UNARY(DEVICE, FUNC, T, TYPE)                                    \
  void BM_##DEVICE##_##FUNC##_##TYPE(::testing::benchmark::State& state) { \
    const int num = state.range(0);                                        \
    test::Benchmark(#DEVICE, Unary<T>(#FUNC, num, TYPE),                   \
                    /*old_benchmark_api=*/false)                           \
        .Run(state);                                                       \
    const int64 tot = static_cast<int64>(state.iterations()) * num;        \
    state.SetItemsProcessed(tot);                                          \
    state.SetBytesProcessed(tot * sizeof(T));                              \
  }                                                                        \
  BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)                                 \
      ->UseRealTime()                                                      \
      ->Range(4 << 10, 1 << 20);

BM_UNARY(cpu, Floor, float, DT_FLOAT);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Floor, float, DT_FLOAT);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

BM_UNARY(cpu, Floor, double, DT_DOUBLE);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Floor, double, DT_DOUBLE);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

BM_UNARY(cpu, Conj, std::complex<float>, DT_COMPLEX64);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Conj, std::complex<float>, DT_COMPLEX64);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(cpu, Conj, std::complex<double>, DT_COMPLEX128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Conj, std::complex<double>, DT_COMPLEX128);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

BM_UNARY(cpu, Rint, double, DT_DOUBLE);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Rint, double, DT_DOUBLE);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(cpu, Rint, float, DT_FLOAT);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Rint, float, DT_FLOAT);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

BM_UNARY(cpu, Round, double, DT_DOUBLE);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Round, double, DT_DOUBLE);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(cpu, Round, float, DT_FLOAT);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Round, float, DT_FLOAT);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Creates a Graph which applies a binary "func" to a 3D float tensor of
// "num" elements and a scalar operand.
Graph* BinaryScalar(int num, const string& func) {
  Graph* g = new Graph(OpRegistry::Global());
  Tensor lhs(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
  lhs.flat<float>().setRandom();
  Tensor rhs(DT_FLOAT, TensorShape({}));
  rhs.flat<float>().setRandom();
  test::graph::Binary(g, func, test::graph::Constant(g, lhs),
                      test::graph::Constant(g, rhs));
  return g;
}

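// BM_BINARY_SCALAR benchmarks FUNC applied to a "num"-element tensor and a
// broadcast scalar. The explicit Arg() list covers sizes from 2^12 to 2^20 and
// includes several off-power-of-two points (e.g. (1 << 15) - (1 << 13)),
// presumably to cover sizes that do not line up with power-of-two boundaries.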
#define BM_BINARY_SCALAR(DEVICE, FUNC)                                     \
  void BM_##DEVICE##_##FUNC##_scalar(::testing::benchmark::State& state) { \
    const int num = state.range(0);                                        \
                                                                           \
    test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC),                     \
                    /*old_benchmark_api=*/false)                           \
        .Run(state);                                                       \
    const int64 tot = static_cast<int64>(state.iterations()) * num;        \
    state.SetItemsProcessed(tot);                                          \
    state.SetBytesProcessed(tot * sizeof(float));                          \
  }                                                                        \
  BENCHMARK(BM_##DEVICE##_##FUNC##_scalar)                                 \
      ->Arg(1 << 12) /* must >= 4096 */                                    \
      ->Arg(1 << 13)                                                       \
      ->Arg(1 << 14)                                                       \
      ->Arg((1 << 15) - (1 << 13))                                         \
      ->Arg(1 << 15)                                                       \
      ->Arg((1 << 15) + (1 << 14))                                         \
      ->Arg(1 << 16)                                                       \
      ->Arg((1 << 17) - (1 << 15))                                         \
      ->Arg(1 << 17)                                                       \
      ->Arg((1 << 17) + (1 << 16))                                         \
      ->Arg(1 << 18)                                                       \
      ->Arg(1 << 19)                                                       \
      ->Arg(1 << 20);

BM_BINARY_SCALAR(cpu, Less);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BINARY_SCALAR(gpu, Less);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

BM_BINARY_SCALAR(cpu, Add);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BINARY_SCALAR(gpu, Add);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

BM_BINARY_SCALAR(cpu, DivNoNan);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BINARY_SCALAR(gpu, DivNoNan);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#undef BM_BINARY_SCALAR

// Three implementations of x^3.
Graph* CubeWithPow3(int num) {
  Graph* g = new Graph(OpRegistry::Global());
  Tensor lhs(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
  lhs.flat<float>().setRandom();
  Tensor rhs(DT_FLOAT, TensorShape({}));
  rhs.flat<float>().setConstant(3);
  test::graph::Binary(g, "Pow", test::graph::Constant(g, lhs),
                      test::graph::Constant(g, rhs));
  return g;
}

Graph* CubeWithTwoMuls(int num) {
  Graph* g = new Graph(OpRegistry::Global());
  Tensor lhs(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
  lhs.flat<float>().setRandom();
  auto* x = test::graph::Constant(g, lhs);
  auto* inner = test::graph::Binary(g, "Mul", x, x);
  test::graph::Binary(g, "Mul", x, inner);
  return g;
}

Graph* CubeWithMulSquare(int num) {
  Graph* g = new Graph(OpRegistry::Global());
  Tensor lhs(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
  lhs.flat<float>().setRandom();
  auto* x = test::graph::Constant(g, lhs);
  auto* inner = test::graph::Unary(g, "Square", x);
  test::graph::Binary(g, "Mul", test::graph::Constant(g, lhs), inner);
  return g;
}

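// BM_CUBE runs the three x^3 graphs above (Pow(x, 3), x * x * x via two Muls,
// and x * Square(x)) on the same input sizes so the relative cost of the
// kernel choices can be compared directly.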
#define BM_CUBE(DEVICE, Impl)                                          \
  void BM_##DEVICE##_Cube_##Impl(::testing::benchmark::State& state) { \
    const int num = state.range(0);                                    \
                                                                       \
    test::Benchmark(#DEVICE, Impl(num), /*old_benchmark_api=*/false)   \
        .Run(state);                                                   \
    const int64 tot = static_cast<int64>(state.iterations()) * num;    \
    state.SetItemsProcessed(tot);                                      \
    state.SetBytesProcessed(tot * sizeof(float));                      \
  }                                                                    \
  BENCHMARK(BM_##DEVICE##_Cube_##Impl)                                 \
      ->UseRealTime()                                                  \
      ->Arg(1 << 12) /* must >= 4096 */                                \
      ->Arg(1 << 16)                                                   \
      ->Arg(1 << 20);

BM_CUBE(cpu, CubeWithPow3);
BM_CUBE(cpu, CubeWithTwoMuls);
BM_CUBE(cpu, CubeWithMulSquare);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_CUBE(gpu, CubeWithPow3);
BM_CUBE(gpu, CubeWithTwoMuls);
BM_CUBE(gpu, CubeWithMulSquare);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#undef BM_CUBE

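// Creates a Graph which applies BiasAdd to a rows x cols matrix of type T and
// a bias vector of length cols.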
template <class T>
Graph* BiasAdd(int rows, int cols, DataType type) {
  Graph* g = new Graph(OpRegistry::Global());
  Tensor lhs(type, TensorShape({rows, cols}));
  lhs.template flat<T>().setRandom();
  TensorShape rhs_shape({cols});
  Tensor rhs(type, rhs_shape);
  rhs.template flat<T>().setRandom();
  test::graph::Binary(g, "BiasAdd", test::graph::Constant(g, lhs),
                      test::graph::Constant(g, rhs));
  return g;
}

#define BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, R, C)                          \
  void BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C(                        \
      ::testing::benchmark::State& state) {                                 \
    const int arg = state.range(0);                                         \
    const int rows = RowsFromArg(arg);                                      \
    const int cols = ColsFromArg(arg);                                      \
    const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
    test::Benchmark(#DEVICE, BiasAdd<C_TYPE>(rows, cols, TF_TYPE),          \
                    /*old_benchmark_api=*/false)                            \
        .Run(state);                                                        \
    state.SetItemsProcessed(tot);                                           \
    state.SetBytesProcessed(tot * sizeof(C_TYPE));                          \
  }                                                                         \
  BENCHMARK(BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C)                   \
      ->UseRealTime()                                                       \
      ->Arg(RowsAndColsArg(R, C));

#define BM_BIAS_ADD_ALL(DEVICE, C_TYPE, TF_TYPE)   \
  BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 512, 2048); \
  BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 512, 4096); \
  BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 2048, 512); \
  BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 4096, 512);

using Eigen::half;
BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_ALL(cpu, half, DT_HALF);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_ALL(gpu, half, DT_HALF);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef BM_BIAS_ADD_ALL
#undef BM_BIAS_ADD

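// Creates a Graph containing a single BiasAddGrad node whose input is a random
// tensor of type T, laid out as {channels, rows, cols} for NCHW or
// {rows, cols, channels} for NHWC, matching the "data_format" attr.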
template <class T>
Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
                   TensorFormat format) {
  Graph* g = new Graph(OpRegistry::Global());
  TensorShape lhs_shape;
  if (format == FORMAT_NCHW) {
    lhs_shape = TensorShape({channels, rows, cols});
  } else {
    lhs_shape = TensorShape({rows, cols, channels});
  }
  Tensor lhs(type, lhs_shape);
  lhs.template flat<T>().setRandom();
  Node* n;
  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "BiasAddGrad")
                  .Attr("data_format", ToString(format))
                  .Input(test::graph::Constant(g, lhs), /*src_index=*/0)
                  .Finalize(g, &n));
  return g;
}

#define BM_BIAS_ADD_GRAD(DEVICE, FMT, C_TYPE, TF_TYPE, R, C, CH)               \
  void BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH(      \
      ::testing::benchmark::State& state) {                                    \
    const int arg = state.range(0);                                            \
    const int channels = state.range(1);                                       \
                                                                               \
    const int rows = RowsFromArg(arg);                                         \
    const int cols = ColsFromArg(arg);                                         \
    test::Benchmark(                                                           \
        #DEVICE,                                                               \
        BiasAddGrad<C_TYPE>(rows, cols, channels, TF_TYPE, FORMAT_##FMT),      \
        /*old_benchmark_api=*/false)                                           \
        .Run(state);                                                           \
    const int64 tot =                                                          \
        static_cast<int64>(state.iterations()) * rows * cols * channels;       \
    state.SetItemsProcessed(tot);                                              \
    state.SetBytesProcessed(tot * sizeof(C_TYPE));                             \
  }                                                                            \
  BENCHMARK(BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH) \
      ->ArgPair(RowsAndColsArg(R, C), CH);

#define BM_BIAS_ADD_GRAD_ALL(DEVICE, FORMAT, C_TYPE, TF_TYPE)       \
  BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 64, 64, 64);    \
  BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 512, 512, 4);   \
  BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 512, 512, 1);   \
  BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 4); \
  BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 1);

using Eigen::half;
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT);
BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef BM_BIAS_ADD_GRAD_ALL
#undef BM_BIAS_ADD_GRAD

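// Creates a Graph which adds two random float tensors with broadcasting.
// "dim" selects the broadcast pattern:
//   0: {rows, cols} + {rows, 1}  (per-row value broadcast across columns)
//   1: {rows, cols} + {cols}     (per-column value broadcast across rows)
//   2: {rows, 1} + {1, cols}     (outer broadcast, column vector + row vector)
//   3: {1, cols} + {rows, 1}     (outer broadcast with operands swapped)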
Graph* BcastAdd(int rows, int cols, int dim) {
  Graph* g = new Graph(OpRegistry::Global());
  TensorShape lhs_shape, rhs_shape;
  if (dim == 0) {  // row
    lhs_shape = TensorShape({rows, cols});
    rhs_shape = TensorShape({rows, 1});
  } else if (dim == 1) {  // col
    lhs_shape = TensorShape({rows, cols});
    rhs_shape = TensorShape({cols});
  } else if (dim == 2) {  // cross_rc
    lhs_shape = TensorShape({rows, 1});
    rhs_shape = TensorShape({1, cols});
  } else {  // cross_cr
    lhs_shape = TensorShape({1, cols});
    rhs_shape = TensorShape({rows, 1});
  }
  Tensor lhs(DT_FLOAT, lhs_shape);
  lhs.flat<float>().setRandom();
  Tensor rhs(DT_FLOAT, rhs_shape);
  rhs.flat<float>().setRandom();
  test::graph::Binary(g, "Add", test::graph::Constant(g, lhs),
                      test::graph::Constant(g, rhs));
  return g;
}

#define BM_BCAST_ADD_ROW(DEVICE, R, C)                                      \
  void BM_##DEVICE##_BcastAddRow_R##R##_C##C(                               \
      ::testing::benchmark::State& state) {                                 \
    const int arg = state.range(0);                                         \
                                                                            \
    const int rows = RowsFromArg(arg);                                      \
    const int cols = ColsFromArg(arg);                                      \
    test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0),                       \
                    /*old_benchmark_api=*/false)                            \
        .Run(state);                                                        \
    const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
    state.SetItemsProcessed(tot);                                           \
    state.SetBytesProcessed(tot * sizeof(float));                           \
  }                                                                         \
  BENCHMARK(BM_##DEVICE##_BcastAddRow_R##R##_C##C)->Arg(RowsAndColsArg(R, C));

#define BM_BCAST_ADD_ROW_ALL(DEVICE)   \
  BM_BCAST_ADD_ROW(DEVICE, 512, 2048); \
  BM_BCAST_ADD_ROW(DEVICE, 512, 4096); \
  BM_BCAST_ADD_ROW(DEVICE, 2048, 512); \
  BM_BCAST_ADD_ROW(DEVICE, 4096, 512);
BM_BCAST_ADD_ROW_ALL(cpu);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BCAST_ADD_ROW_ALL(gpu);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef BM_BCAST_ADD_ROW_ALL
#undef BM_BCAST_ADD_ROW

#define BM_BCAST_ADD_COL(DEVICE, R, C)                                      \
  void BM_##DEVICE##_BcastAddCol_R##R##_C##C(                               \
      ::testing::benchmark::State& state) {                                 \
    const int arg = state.range(0);                                         \
                                                                            \
    const int rows = RowsFromArg(arg);                                      \
    const int cols = ColsFromArg(arg);                                      \
    test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1),                       \
                    /*old_benchmark_api=*/false)                            \
        .Run(state);                                                        \
    const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
                                                                            \
    state.SetItemsProcessed(tot);                                           \
    state.SetBytesProcessed(tot * sizeof(float));                           \
  }                                                                         \
  BENCHMARK(BM_##DEVICE##_BcastAddCol_R##R##_C##C)                          \
      ->UseRealTime()                                                       \
      ->Arg(RowsAndColsArg(R, C));

#define BM_BCAST_ADD_COL_ALL(DEVICE)   \
  BM_BCAST_ADD_COL(DEVICE, 512, 2048); \
  BM_BCAST_ADD_COL(DEVICE, 512, 4096); \
  BM_BCAST_ADD_COL(DEVICE, 2048, 512); \
  BM_BCAST_ADD_COL(DEVICE, 4096, 512);
BM_BCAST_ADD_COL_ALL(cpu);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BCAST_ADD_COL_ALL(gpu);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef BM_BCAST_ADD_COL_ALL
#undef BM_BCAST_ADD_COL

#define BM_BCAST_ADD_CROSS_RC(DEVICE, R, C)                                 \
  void BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C(                           \
      ::testing::benchmark::State& state) {                                 \
    const int arg = state.range(0);                                         \
                                                                            \
    const int rows = RowsFromArg(arg);                                      \
    const int cols = ColsFromArg(arg);                                      \
    test::Benchmark(#DEVICE, BcastAdd(rows, cols, 2),                       \
                    /*old_benchmark_api=*/false)                            \
        .Run(state);                                                        \
    const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
                                                                            \
    state.SetItemsProcessed(tot);                                           \
    state.SetBytesProcessed(tot * sizeof(float));                           \
  }                                                                         \
  BENCHMARK(BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C)                      \
      ->UseRealTime()                                                       \
      ->Arg(RowsAndColsArg(R, C));

#define BM_BCAST_ADD_CROSS_RC_ALL(DEVICE)   \
  BM_BCAST_ADD_CROSS_RC(DEVICE, 512, 2048); \
  BM_BCAST_ADD_CROSS_RC(DEVICE, 512, 4096); \
  BM_BCAST_ADD_CROSS_RC(DEVICE, 2048, 512); \
  BM_BCAST_ADD_CROSS_RC(DEVICE, 4096, 512);
BM_BCAST_ADD_CROSS_RC_ALL(cpu);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BCAST_ADD_CROSS_RC_ALL(gpu);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef BM_BCAST_ADD_CROSS_RC_ALL
#undef BM_BCAST_ADD_CROSS_RC

#define BM_BCAST_ADD_CROSS_CR(DEVICE, R, C)                                 \
  void BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C(                           \
      ::testing::benchmark::State& state) {                                 \
    const int arg = state.range(0);                                         \
                                                                            \
    const int rows = RowsFromArg(arg);                                      \
    const int cols = ColsFromArg(arg);                                      \
    test::Benchmark(#DEVICE, BcastAdd(rows, cols, 3),                       \
                    /*old_benchmark_api=*/false)                            \
        .Run(state);                                                        \
    const int64 tot = static_cast<int64>(state.iterations()) * rows * cols; \
    state.SetItemsProcessed(tot);                                           \
    state.SetBytesProcessed(tot * sizeof(float));                           \
  }                                                                         \
  BENCHMARK(BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C)                      \
      ->UseRealTime()                                                       \
      ->Arg(RowsAndColsArg(R, C));

#define BM_BCAST_ADD_CROSS_CR_ALL(DEVICE)   \
  BM_BCAST_ADD_CROSS_CR(DEVICE, 512, 2048); \
  BM_BCAST_ADD_CROSS_CR(DEVICE, 512, 4096); \
  BM_BCAST_ADD_CROSS_CR(DEVICE, 2048, 512); \
  BM_BCAST_ADD_CROSS_CR(DEVICE, 4096, 512);
BM_BCAST_ADD_CROSS_CR_ALL(cpu);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BCAST_ADD_CROSS_CR_ALL(gpu);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef BM_BCAST_ADD_CROSS_CR_ALL
#undef BM_BCAST_ADD_CROSS_CR

}  // namespace
}  // namespace tensorflow