/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/core/util/tensor_format.h"
namespace tensorflow {
namespace {

// Creates a Graph which applies a unary "func" on a 3D tensor of
// type T with "num" elements.
template <typename T>
static Graph* Unary(const string& func, int num, DataType dtype) {
  Graph* g = new Graph(OpRegistry::Global());
  Tensor data(dtype, TensorShape({64, 64, num / (64 * 64)}));
  CHECK_GT(data.NumElements(), 0);
  data.flat<T>().setRandom();
  test::graph::Unary(g, func, test::graph::Constant(g, data), 0);
  return g;
}
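
// Note: the {64, 64, num / (64 * 64)} shape requires "num" to be at least
// 64 * 64 = 4096 for a nonzero element count (the CHECK above enforces
// this), which is why the benchmark sizes below start at 4096.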

const int kRows = 100000;

int RowsAndColsArg(int r, int c) { return r * kRows + c; }
int RowsFromArg(int arg) { return (arg / kRows); }
int ColsFromArg(int arg) { return (arg % kRows); }
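
// Example: RowsAndColsArg(512, 2048) == 512 * 100000 + 2048 == 51202048;
// RowsFromArg(51202048) recovers 512 and ColsFromArg(51202048) recovers 2048.
// The packing is lossless as long as cols < kRows.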

#define BM_UNARY(DEVICE, FUNC, T, TYPE)                                    \
  void BM_##DEVICE##_##FUNC##_##TYPE(::testing::benchmark::State& state) { \
    const int num = state.range(0);                                        \
    test::Benchmark(#DEVICE, Unary<T>(#FUNC, num, TYPE),                   \
                    /*old_benchmark_api=*/false)                           \
        .Run(state);                                                       \
    const int64 tot = static_cast<int64>(state.iterations()) * num;        \
    state.SetItemsProcessed(tot);                                          \
    state.SetBytesProcessed(tot * sizeof(T));                              \
  }                                                                        \
  BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)                                 \
      ->UseRealTime()                                                      \
      ->Range(4 << 10, 1 << 20);
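
// For example, BM_UNARY(cpu, Floor, float, DT_FLOAT) defines and registers a
// benchmark named BM_cpu_Floor_DT_FLOAT that sweeps the element count from
// 4 << 10 (4096) to 1 << 20 (1048576).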

BM_UNARY(cpu, Floor, float, DT_FLOAT);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Floor, float, DT_FLOAT);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

BM_UNARY(cpu, Floor, double, DT_DOUBLE);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Floor, double, DT_DOUBLE);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

BM_UNARY(cpu, Conj, std::complex<float>, DT_COMPLEX64);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Conj, std::complex<float>, DT_COMPLEX64);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(cpu, Conj, std::complex<double>, DT_COMPLEX128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Conj, std::complex<double>, DT_COMPLEX128);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

BM_UNARY(cpu, Rint, double, DT_DOUBLE);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Rint, double, DT_DOUBLE);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(cpu, Rint, float, DT_FLOAT);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Rint, float, DT_FLOAT);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

BM_UNARY(cpu, Round, double, DT_DOUBLE);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Round, double, DT_DOUBLE);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(cpu, Round, float, DT_FLOAT);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_UNARY(gpu, Round, float, DT_FLOAT);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// Creates a Graph computing binary "func" over a 3D float tensor with "num"
// elements and a scalar float.
Graph* BinaryScalar(int num, const string& func) {
  Graph* g = new Graph(OpRegistry::Global());
  Tensor lhs(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
  lhs.flat<float>().setRandom();
  Tensor rhs(DT_FLOAT, TensorShape({}));
  rhs.flat<float>().setRandom();
  test::graph::Binary(g, func, test::graph::Constant(g, lhs),
                      test::graph::Constant(g, rhs));
  return g;
}

#define BM_BINARY_SCALAR(DEVICE, FUNC)                                     \
  void BM_##DEVICE##_##FUNC##_scalar(::testing::benchmark::State& state) { \
    const int num = state.range(0);                                        \
                                                                           \
    test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC),                     \
                    /*old_benchmark_api=*/false)                           \
        .Run(state);                                                       \
    const int64 tot = static_cast<int64>(state.iterations()) * num;        \
    state.SetItemsProcessed(tot);                                          \
    state.SetBytesProcessed(tot * sizeof(float));                          \
  }                                                                        \
  BENCHMARK(BM_##DEVICE##_##FUNC##_scalar)                                 \
      ->Arg(1 << 12) /* must be >= 4096 */                                 \
      ->Arg(1 << 13)                                                       \
      ->Arg(1 << 14)                                                       \
      ->Arg((1 << 15) - (1 << 13))                                         \
      ->Arg(1 << 15)                                                       \
      ->Arg((1 << 15) + (1 << 14))                                         \
      ->Arg(1 << 16)                                                       \
      ->Arg((1 << 17) - (1 << 15))                                         \
      ->Arg(1 << 17)                                                       \
      ->Arg((1 << 17) + (1 << 16))                                         \
      ->Arg(1 << 18)                                                       \
      ->Arg(1 << 19)                                                       \
      ->Arg(1 << 20);
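
// The argument list above deliberately brackets 1 << 15 and 1 << 17
// (e.g. (1 << 15) - (1 << 13) and (1 << 15) + (1 << 14)), presumably to
// sample performance just below, at, and just above those sizes.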

BM_BINARY_SCALAR(cpu, Less);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BINARY_SCALAR(gpu, Less);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

BM_BINARY_SCALAR(cpu, Add);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BINARY_SCALAR(gpu, Add);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

BM_BINARY_SCALAR(cpu, DivNoNan);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BINARY_SCALAR(gpu, DivNoNan);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#undef BM_BINARY_SCALAR

// Three implementations of x^3.
Graph* CubeWithPow3(int num) {
  Graph* g = new Graph(OpRegistry::Global());
  Tensor lhs(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
  lhs.flat<float>().setRandom();
  Tensor rhs(DT_FLOAT, TensorShape({}));
  rhs.flat<float>().setConstant(3);
  test::graph::Binary(g, "Pow", test::graph::Constant(g, lhs),
                      test::graph::Constant(g, rhs));
  return g;
}

Graph* CubeWithTwoMuls(int num) {
  Graph* g = new Graph(OpRegistry::Global());
  Tensor lhs(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
  lhs.flat<float>().setRandom();
  auto* x = test::graph::Constant(g, lhs);
  auto* inner = test::graph::Binary(g, "Mul", x, x);
  test::graph::Binary(g, "Mul", x, inner);
  return g;
}

Graph* CubeWithMulSquare(int num) {
  Graph* g = new Graph(OpRegistry::Global());
  Tensor lhs(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
  lhs.flat<float>().setRandom();
  auto* x = test::graph::Constant(g, lhs);
  auto* inner = test::graph::Unary(g, "Square", x);
  test::graph::Binary(g, "Mul", test::graph::Constant(g, lhs), inner);
  return g;
}
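
// All three builders above compute x^3 elementwise over identically shaped
// inputs, so their items- and bytes-processed rates are directly comparable.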

#define BM_CUBE(DEVICE, Impl)                                          \
  void BM_##DEVICE##_Cube_##Impl(::testing::benchmark::State& state) { \
    const int num = state.range(0);                                    \
                                                                       \
    test::Benchmark(#DEVICE, Impl(num), /*old_benchmark_api=*/false)   \
        .Run(state);                                                   \
    const int64 tot = static_cast<int64>(state.iterations()) * num;    \
    state.SetItemsProcessed(tot);                                      \
    state.SetBytesProcessed(tot * sizeof(float));                      \
  }                                                                    \
  BENCHMARK(BM_##DEVICE##_Cube_##Impl)                                 \
      ->UseRealTime()                                                  \
      ->Arg(1 << 12) /* must be >= 4096 */                             \
      ->Arg(1 << 16)                                                   \
      ->Arg(1 << 20);

BM_CUBE(cpu, CubeWithPow3);
BM_CUBE(cpu, CubeWithTwoMuls);
BM_CUBE(cpu, CubeWithMulSquare);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_CUBE(gpu, CubeWithPow3);
BM_CUBE(gpu, CubeWithTwoMuls);
BM_CUBE(gpu, CubeWithMulSquare);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#undef BM_CUBE

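// Creates a Graph computing BiasAdd of a {rows, cols} matrix of type T and a
// length-cols bias vector.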
template <class T>
Graph* BiasAdd(int rows, int cols, DataType type) {
  Graph* g = new Graph(OpRegistry::Global());
  Tensor lhs(type, TensorShape({rows, cols}));
  lhs.template flat<T>().setRandom();
  Tensor rhs(type, TensorShape({cols}));
  rhs.template flat<T>().setRandom();
  test::graph::Binary(g, "BiasAdd", test::graph::Constant(g, lhs),
                      test::graph::Constant(g, rhs));
  return g;
}

#define BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, R, C)                           \
  void BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C(                         \
      ::testing::benchmark::State& state) {                                  \
    const int arg = state.range(0);                                          \
    const int rows = RowsFromArg(arg);                                       \
    const int cols = ColsFromArg(arg);                                       \
    test::Benchmark(#DEVICE, BiasAdd<C_TYPE>(rows, cols, TF_TYPE),           \
                    /*old_benchmark_api=*/false)                             \
        .Run(state);                                                         \
    const int64 tot = static_cast<int64>(state.iterations()) * rows * cols;  \
    state.SetItemsProcessed(tot);                                            \
    state.SetBytesProcessed(tot * sizeof(C_TYPE));                           \
  }                                                                          \
  BENCHMARK(BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C)                    \
      ->UseRealTime()                                                        \
      ->Arg(RowsAndColsArg(R, C));
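
// For example, BM_BIAS_ADD(cpu, float, DT_FLOAT, 512, 2048) registers
// BM_cpu_float_BiasAdd_R512_C2048 with the single packed argument
// RowsAndColsArg(512, 2048).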

#define BM_BIAS_ADD_ALL(DEVICE, C_TYPE, TF_TYPE)   \
  BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 512, 2048); \
  BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 512, 4096); \
  BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 2048, 512); \
  BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 4096, 512);

using Eigen::half;
BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_ALL(cpu, half, DT_HALF);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_ALL(gpu, half, DT_HALF);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef BM_BIAS_ADD_ALL
#undef BM_BIAS_ADD

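// Creates a Graph computing BiasAddGrad over a rows x cols x channels tensor
// of type T, laid out according to "format" (NCHW or NHWC).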
template <class T>
Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
                   TensorFormat format) {
  Graph* g = new Graph(OpRegistry::Global());
  TensorShape lhs_shape;
  if (format == FORMAT_NCHW) {
    lhs_shape = TensorShape({channels, rows, cols});
  } else {
    lhs_shape = TensorShape({rows, cols, channels});
  }
  Tensor lhs(type, lhs_shape);
  lhs.template flat<T>().setRandom();
  Node* n;
  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "BiasAddGrad")
                  .Attr("data_format", ToString(format))
                  .Input(test::graph::Constant(g, lhs), /*src_index=*/0)
                  .Finalize(g, &n));
  return g;
}

#define BM_BIAS_ADD_GRAD(DEVICE, FMT, C_TYPE, TF_TYPE, R, C, CH)               \
  void BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH(      \
      ::testing::benchmark::State& state) {                                    \
    const int arg = state.range(0);                                            \
    const int channels = state.range(1);                                       \
                                                                               \
    const int rows = RowsFromArg(arg);                                         \
    const int cols = ColsFromArg(arg);                                         \
    test::Benchmark(                                                           \
        #DEVICE,                                                               \
        BiasAddGrad<C_TYPE>(rows, cols, channels, TF_TYPE, FORMAT_##FMT),      \
        /*old_benchmark_api=*/false)                                           \
        .Run(state);                                                           \
    const int64 tot =                                                          \
        static_cast<int64>(state.iterations()) * rows * cols * channels;       \
    state.SetItemsProcessed(tot);                                              \
    state.SetBytesProcessed(tot * sizeof(C_TYPE));                             \
  }                                                                            \
  BENCHMARK(BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH) \
      ->ArgPair(RowsAndColsArg(R, C), CH);

#define BM_BIAS_ADD_GRAD_ALL(DEVICE, FORMAT, C_TYPE, TF_TYPE)       \
  BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 64, 64, 64);    \
  BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 512, 512, 4);   \
  BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 512, 512, 1);   \
  BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 4); \
  BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 1);

using Eigen::half;
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT);
BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef BM_BIAS_ADD_GRAD_ALL
#undef BM_BIAS_ADD_GRAD

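// Creates a Graph computing a broadcast "Add". "dim" selects the operand
// shapes: 0 adds a {rows, 1} column vector to a {rows, cols} matrix, 1 adds
// a {cols} row vector to it, and 2/3 broadcast {rows, 1} against {1, cols}
// in either order; all variants produce a {rows, cols} result.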
Graph* BcastAdd(int rows, int cols, int dim) {
  Graph* g = new Graph(OpRegistry::Global());
  TensorShape lhs_shape, rhs_shape;
  if (dim == 0) {  // row
    lhs_shape = TensorShape({rows, cols});
    rhs_shape = TensorShape({rows, 1});
  } else if (dim == 1) {  // col
    lhs_shape = TensorShape({rows, cols});
    rhs_shape = TensorShape({cols});
  } else if (dim == 2) {  // cross_rc
    lhs_shape = TensorShape({rows, 1});
    rhs_shape = TensorShape({1, cols});
  } else {  // cross_cr
    lhs_shape = TensorShape({1, cols});
    rhs_shape = TensorShape({rows, 1});
  }
  Tensor lhs(DT_FLOAT, lhs_shape);
  lhs.flat<float>().setRandom();
  Tensor rhs(DT_FLOAT, rhs_shape);
  rhs.flat<float>().setRandom();
  test::graph::Binary(g, "Add", test::graph::Constant(g, lhs),
                      test::graph::Constant(g, rhs));
  return g;
}

#define BM_BCAST_ADD_ROW(DEVICE, R, C)                                       \
  void BM_##DEVICE##_BcastAddRow_R##R##_C##C(                                \
      ::testing::benchmark::State& state) {                                  \
    const int arg = state.range(0);                                          \
                                                                             \
    const int rows = RowsFromArg(arg);                                       \
    const int cols = ColsFromArg(arg);                                       \
    test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0),                        \
                    /*old_benchmark_api=*/false)                             \
        .Run(state);                                                         \
    const int64 tot = static_cast<int64>(state.iterations()) * rows * cols;  \
    state.SetItemsProcessed(tot);                                            \
    state.SetBytesProcessed(tot * sizeof(float));                            \
  }                                                                          \
  BENCHMARK(BM_##DEVICE##_BcastAddRow_R##R##_C##C)->Arg(RowsAndColsArg(R, C));

#define BM_BCAST_ADD_ROW_ALL(DEVICE)   \
  BM_BCAST_ADD_ROW(DEVICE, 512, 2048); \
  BM_BCAST_ADD_ROW(DEVICE, 512, 4096); \
  BM_BCAST_ADD_ROW(DEVICE, 2048, 512); \
  BM_BCAST_ADD_ROW(DEVICE, 4096, 512);
BM_BCAST_ADD_ROW_ALL(cpu);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BCAST_ADD_ROW_ALL(gpu);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef BM_BCAST_ADD_ROW_ALL
#undef BM_BCAST_ADD_ROW

#define BM_BCAST_ADD_COL(DEVICE, R, C)                                       \
  void BM_##DEVICE##_BcastAddCol_R##R##_C##C(                                \
      ::testing::benchmark::State& state) {                                  \
    const int arg = state.range(0);                                          \
                                                                             \
    const int rows = RowsFromArg(arg);                                       \
    const int cols = ColsFromArg(arg);                                       \
    test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1),                        \
                    /*old_benchmark_api=*/false)                             \
        .Run(state);                                                         \
    const int64 tot = static_cast<int64>(state.iterations()) * rows * cols;  \
    state.SetItemsProcessed(tot);                                            \
    state.SetBytesProcessed(tot * sizeof(float));                            \
  }                                                                          \
  BENCHMARK(BM_##DEVICE##_BcastAddCol_R##R##_C##C)                           \
      ->UseRealTime()                                                        \
      ->Arg(RowsAndColsArg(R, C));

#define BM_BCAST_ADD_COL_ALL(DEVICE)   \
  BM_BCAST_ADD_COL(DEVICE, 512, 2048); \
  BM_BCAST_ADD_COL(DEVICE, 512, 4096); \
  BM_BCAST_ADD_COL(DEVICE, 2048, 512); \
  BM_BCAST_ADD_COL(DEVICE, 4096, 512);
BM_BCAST_ADD_COL_ALL(cpu);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BCAST_ADD_COL_ALL(gpu);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef BM_BCAST_ADD_COL_ALL
#undef BM_BCAST_ADD_COL

#define BM_BCAST_ADD_CROSS_RC(DEVICE, R, C)                                  \
  void BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C(                            \
      ::testing::benchmark::State& state) {                                  \
    const int arg = state.range(0);                                          \
                                                                             \
    const int rows = RowsFromArg(arg);                                       \
    const int cols = ColsFromArg(arg);                                       \
    test::Benchmark(#DEVICE, BcastAdd(rows, cols, 2),                        \
                    /*old_benchmark_api=*/false)                             \
        .Run(state);                                                         \
    const int64 tot = static_cast<int64>(state.iterations()) * rows * cols;  \
    state.SetItemsProcessed(tot);                                            \
    state.SetBytesProcessed(tot * sizeof(float));                            \
  }                                                                          \
  BENCHMARK(BM_##DEVICE##_BcastAddCrossRC_R##R##_C##C)                       \
      ->UseRealTime()                                                        \
      ->Arg(RowsAndColsArg(R, C));

#define BM_BCAST_ADD_CROSS_RC_ALL(DEVICE)   \
  BM_BCAST_ADD_CROSS_RC(DEVICE, 512, 2048); \
  BM_BCAST_ADD_CROSS_RC(DEVICE, 512, 4096); \
  BM_BCAST_ADD_CROSS_RC(DEVICE, 2048, 512); \
  BM_BCAST_ADD_CROSS_RC(DEVICE, 4096, 512);
BM_BCAST_ADD_CROSS_RC_ALL(cpu);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BCAST_ADD_CROSS_RC_ALL(gpu);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef BM_BCAST_ADD_CROSS_RC_ALL
#undef BM_BCAST_ADD_CROSS_RC

#define BM_BCAST_ADD_CROSS_CR(DEVICE, R, C)                                  \
  void BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C(                            \
      ::testing::benchmark::State& state) {                                  \
    const int arg = state.range(0);                                          \
                                                                             \
    const int rows = RowsFromArg(arg);                                       \
    const int cols = ColsFromArg(arg);                                       \
    test::Benchmark(#DEVICE, BcastAdd(rows, cols, 3),                        \
                    /*old_benchmark_api=*/false)                             \
        .Run(state);                                                         \
    const int64 tot = static_cast<int64>(state.iterations()) * rows * cols;  \
    state.SetItemsProcessed(tot);                                            \
    state.SetBytesProcessed(tot * sizeof(float));                            \
  }                                                                          \
  BENCHMARK(BM_##DEVICE##_BcastAddCrossCR_R##R##_C##C)                       \
      ->UseRealTime()                                                        \
      ->Arg(RowsAndColsArg(R, C));

#define BM_BCAST_ADD_CROSS_CR_ALL(DEVICE)   \
  BM_BCAST_ADD_CROSS_CR(DEVICE, 512, 2048); \
  BM_BCAST_ADD_CROSS_CR(DEVICE, 512, 4096); \
  BM_BCAST_ADD_CROSS_CR(DEVICE, 2048, 512); \
  BM_BCAST_ADD_CROSS_CR(DEVICE, 4096, 512);
BM_BCAST_ADD_CROSS_CR_ALL(cpu);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
BM_BCAST_ADD_CROSS_CR_ALL(gpu);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef BM_BCAST_ADD_CROSS_CR_ALL
#undef BM_BCAST_ADD_CROSS_CR

}  // namespace
}  // namespace tensorflow