1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3 Licensed under the Apache License, Version 2.0 (the "License");
4 you may not use this file except in compliance with the License.
5 You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9 Unless required by applicable law or agreed to in writing, software
10 distributed under the License is distributed on an "AS IS" BASIS,
11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 See the License for the specific language governing permissions and
13 limitations under the License.
14 ==============================================================================*/
15
16 #include <functional>
17 #include <memory>
18 #include <vector>
19
20 #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
21 #include "tensorflow/core/framework/allocator.h"
22 #include "tensorflow/core/framework/op_kernel.h"
23 #include "tensorflow/core/framework/tensor.h"
24 #include "tensorflow/core/framework/types.h"
25 #include "tensorflow/core/framework/types.pb.h"
26 #include "tensorflow/core/graph/node_builder.h"
27 #include "tensorflow/core/graph/testlib.h"
28 #include "tensorflow/core/kernels/ops_testutil.h"
29 #include "tensorflow/core/kernels/ops_util.h"
30 #include "tensorflow/core/lib/core/status_test_util.h"
31 #include "tensorflow/core/platform/prefetch.h"
32 #include "tensorflow/core/platform/test.h"
33 #include "tensorflow/core/platform/test_benchmark.h"
34
35 namespace tensorflow {
36 namespace {
37
38 template <typename T>
FillTensorWithRandomValues(Tensor * t,int string_length,int64 * bytes)39 void FillTensorWithRandomValues(Tensor* t, int string_length, int64* bytes) {
40 t->flat<T>().setRandom();
41 *bytes = t->flat<T>().size() * sizeof(T);
42 }
43
44 template <>
FillTensorWithRandomValues(Tensor * t,int string_length,int64 * bytes)45 void FillTensorWithRandomValues<tstring>(Tensor* t, int string_length,
46 int64* bytes) {
47 auto ts = t->flat<tstring>();
48 *bytes = 0;
49 for (int i = 0; i < ts.size(); i++) {
50 ts(i) = tstring(string_length, 'x');
51 *bytes += sizeof(ts(i)) + ts(i).size();
52 }
53 }
54
55 // For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim'
56 // in size, and concat them together along "concat_dimension". If T is
57 // std::string, then the length of individual strings in the tensors will be
58 // of length "string_length".
59 template <typename T>
ConcatHelper(::testing::benchmark::State & state,int concat_dimension,int dim2,int string_length=0)60 static void ConcatHelper(::testing::benchmark::State& state,
61 int concat_dimension, int dim2,
62 int string_length = 0) {
63 Graph* g = new Graph(OpRegistry::Global());
64
65 DataType dt = DataTypeToEnum<T>::v();
66 const int kDim1 = 100;
67 Tensor concat_dim(DT_INT32, TensorShape({}));
68 concat_dim.scalar<int32>()() = concat_dimension;
69 Tensor in0(dt, TensorShape({kDim1, dim2}));
70 Tensor in1(dt, TensorShape({kDim1, dim2}));
71 int64_t in0_bytes, in1_bytes;
72 FillTensorWithRandomValues<T>(&in0, string_length, &in0_bytes);
73 FillTensorWithRandomValues<T>(&in1, string_length, &in1_bytes);
74
75 Node* node;
76 TF_CHECK_OK(
77 NodeBuilder(g->NewName("n"), "Concat")
78 .Input(test::graph::Constant(g, concat_dim))
79 .Input({test::graph::Constant(g, in0), test::graph::Constant(g, in1)})
80 .Attr("N", 2)
81 .Attr("T", dt)
82 .Finalize(g, &node));
83
84 test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
85 state.SetBytesProcessed(static_cast<int64>(state.iterations()) *
86 (in0_bytes + in1_bytes));
87 }
88
BM_ConcatDim0Float(::testing::benchmark::State & state)89 void BM_ConcatDim0Float(::testing::benchmark::State& state) {
90 const int dim2 = state.range(0);
91
92 ConcatHelper<float>(state, 0, dim2);
93 }
94
BM_ConcatDim1Float(::testing::benchmark::State & state)95 void BM_ConcatDim1Float(::testing::benchmark::State& state) {
96 const int dim2 = state.range(0);
97
98 ConcatHelper<float>(state, 1, dim2);
99 }
100
// Register the float Concat benchmarks over a range of dim2 sizes.
BENCHMARK(BM_ConcatDim0Float)
    ->UseRealTime()
    ->Arg(1000)
    ->Arg(100000)
    ->Arg(1000000);
BENCHMARK(BM_ConcatDim1Float)
    ->UseRealTime()
    ->Arg(1000)
    ->Arg(100000)
    ->Arg(1000000);
111
BM_ConcatDim0String(::testing::benchmark::State & state)112 void BM_ConcatDim0String(::testing::benchmark::State& state) {
113 const int dim2 = state.range(0);
114 const int string_length = state.range(1);
115
116 ConcatHelper<tstring>(state, 0, dim2, string_length);
117 }
118
// Register the string Concat benchmark with (dim2, string_length) pairs.
BENCHMARK(BM_ConcatDim0String)
    ->UseRealTime()
    ->ArgPair(1, 16)
    ->ArgPair(1, 10000)
    ->ArgPair(100, 16);
124
BM_ConcatDim1uint8(::testing::benchmark::State & state)125 void BM_ConcatDim1uint8(::testing::benchmark::State& state) {
126 const int dim2 = state.range(0);
127
128 ConcatHelper<uint8>(state, 1, dim2);
129 }
BM_ConcatDim1int16(::testing::benchmark::State & state)130 void BM_ConcatDim1int16(::testing::benchmark::State& state) {
131 const int dim2 = state.range(0);
132
133 ConcatHelper<int16>(state, 1, dim2);
134 }
BM_ConcatDim1bfloat16(::testing::benchmark::State & state)135 void BM_ConcatDim1bfloat16(::testing::benchmark::State& state) {
136 const int dim2 = state.range(0);
137
138 ConcatHelper<bfloat16>(state, 1, dim2);
139 }
140
// Register the dim-1 Concat benchmarks for small element types over a range
// of dim2 sizes.
BENCHMARK(BM_ConcatDim1uint8)
    ->UseRealTime()
    ->Arg(1000)
    ->Arg(100000)
    ->Arg(1000000);
BENCHMARK(BM_ConcatDim1int16)
    ->UseRealTime()
    ->Arg(1000)
    ->Arg(100000)
    ->Arg(1000000);
BENCHMARK(BM_ConcatDim1bfloat16)
    ->UseRealTime()
    ->Arg(1000)
    ->Arg(100000)
    ->Arg(1000000);
156
157 template <typename T>
ConcatManyHelper(::testing::benchmark::State & state,int concat_dimension,int dim2)158 static void ConcatManyHelper(::testing::benchmark::State& state,
159 int concat_dimension, int dim2) {
160 Graph* g = new Graph(OpRegistry::Global());
161
162 DataType dt = DataTypeToEnum<T>::v();
163 const int kDim1 = 40000;
164 const int kNumInputs = 64;
165 Tensor concat_dim(DT_INT32, TensorShape({}));
166 concat_dim.scalar<int32>()() = concat_dimension;
167 std::vector<NodeBuilder::NodeOut> inputs;
168 inputs.reserve(kNumInputs);
169 for (int i = 0; i < kNumInputs; ++i) {
170 Tensor in(dt, TensorShape({kDim1, dim2}));
171 in.flat<T>().setRandom();
172 inputs.push_back(test::graph::Constant(g, in));
173 }
174
175 Node* node;
176 TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Concat")
177 .Input(test::graph::Constant(g, concat_dim))
178 .Input(inputs)
179 .Attr("N", 64)
180 .Attr("T", dt)
181 .Finalize(g, &node));
182 test::Benchmark("cpu", g, /*old_benchmark_api*/ false).Run(state);
183 state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim1 *
184 dim2 * kNumInputs * sizeof(T));
185 }
186
BM_ConcatManyDim1bfloat16(::testing::benchmark::State & state)187 void BM_ConcatManyDim1bfloat16(::testing::benchmark::State& state) {
188 const int dim2 = state.range(0);
189
190 ConcatManyHelper<bfloat16>(state, 1, dim2);
191 }
192
193 BENCHMARK(BM_ConcatManyDim1bfloat16)->UseRealTime()->Arg(18)->Arg(34)->Arg(60);
194
MemcpyAlternativeHelper(::testing::benchmark::State & state,int dim2)195 void MemcpyAlternativeHelper(::testing::benchmark::State& state, int dim2) {
196 const int kDim1 = 100;
197 std::vector<float> data1(kDim1 * dim2, 1.0f);
198 std::vector<float> data2(kDim1 * dim2, 2.0f);
199
200 for (auto s : state) {
201 const size_t n0 = data1.size();
202 const size_t n1 = data2.size();
203 float* result = new float[n0 + n1];
204 memcpy(&result[0], &data1[0], n0 * sizeof(float));
205 memcpy(&result[n0], &data2[0], n1 * sizeof(float));
206 delete[] result;
207 }
208 state.SetBytesProcessed(static_cast<int64>(state.iterations()) *
209 ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float));
210 }
211
BM_MemcpyAlternativeDim0(::testing::benchmark::State & state)212 void BM_MemcpyAlternativeDim0(::testing::benchmark::State& state) {
213 const int dim2 = state.range(0);
214
215 MemcpyAlternativeHelper(state, dim2);
216 }
BM_MemcpyAlternativeDim1(::testing::benchmark::State & state)217 void BM_MemcpyAlternativeDim1(::testing::benchmark::State& state) {
218 const int dim2 = state.range(0);
219
220 MemcpyAlternativeHelper(state, dim2);
221 }
222
// Register the memcpy baselines over the same dim2 range as the float
// Concat benchmarks, for direct comparison.
BENCHMARK(BM_MemcpyAlternativeDim0)
    ->UseRealTime()
    ->Arg(1000)
    ->Arg(100000)
    ->Arg(1000000);
BENCHMARK(BM_MemcpyAlternativeDim1)
    ->UseRealTime()
    ->Arg(1000)
    ->Arg(100000)
    ->Arg(1000000);
233
234 typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
235 Eigen::Unaligned>
236 EigenMap;
MemcpyManyAlternative1(::testing::benchmark::State & state)237 void MemcpyManyAlternative1(::testing::benchmark::State& state) {
238 int dim2 = state.range(0);
239 const int kDim1 = 40000;
240 const int kNumCopies = 64;
241 const int size = kDim1 * dim2 * kNumCopies;
242 bfloat16* data = new bfloat16[size];
243 EigenMap map(data, size);
244 map.setRandom();
245
246 for (auto s : state) {
247 std::vector<bfloat16*> inputs(kNumCopies);
248 for (int i = 0; i < kNumCopies; ++i) {
249 inputs[i] = &data[i * kDim1 * dim2];
250 }
251 bfloat16* result = new bfloat16[size];
252 for (int j = 0; j < kNumCopies; ++j) {
253 bfloat16* output = &result[j * dim2];
254 for (int i = 0; i < kDim1; ++i) {
255 if (i + 1 < kDim1) {
256 port::prefetch<port::PREFETCH_HINT_T0>(inputs[j] + dim2);
257 }
258 memcpy(output, inputs[j], dim2 * sizeof(bfloat16));
259 inputs[j] += dim2;
260 output += dim2 * kNumCopies;
261 }
262 }
263 delete[] result;
264 }
265 delete[] data;
266 state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim1 *
267 dim2 * kNumCopies * sizeof(bfloat16));
268 }
269
MemcpyManyAlternative2(::testing::benchmark::State & state)270 void MemcpyManyAlternative2(::testing::benchmark::State& state) {
271 int dim2 = state.range(0);
272 const int kDim1 = 40000;
273 const int kNumCopies = 64;
274 const int size = kDim1 * dim2 * kNumCopies;
275 bfloat16* data = new bfloat16[size];
276 EigenMap map(data, size);
277 map.setRandom();
278
279 std::vector<bfloat16*> inputs(kNumCopies);
280 for (auto s : state) {
281 bfloat16* result = new bfloat16[size];
282 for (int i = 0; i < kNumCopies; ++i) {
283 inputs[i] = &data[i * kDim1 * dim2];
284 }
285 bfloat16* output = result;
286 for (int i = 0; i < kDim1; ++i) {
287 for (int j = 0; j < kNumCopies; ++j) {
288 if (j + 1 < kNumCopies) {
289 port::prefetch<port::PREFETCH_HINT_T0>(inputs[j + 1]);
290 }
291 memcpy(output, inputs[j], dim2 * sizeof(bfloat16));
292 inputs[j] += dim2;
293 output += dim2;
294 }
295 }
296 delete[] result;
297 }
298 delete[] data;
299
300 state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim1 *
301 dim2 * kNumCopies * sizeof(bfloat16));
302 }
303
// Register both memcpy-many variants over dim2 sizes chosen around cache-line
// and power-of-two boundaries (16/17/18, 32/33/34, 60/64/65).
BENCHMARK(MemcpyManyAlternative1)
    ->Arg(16)
    ->Arg(17)
    ->Arg(18)
    ->Arg(32)
    ->Arg(33)
    ->Arg(34)
    ->Arg(60)
    ->Arg(64)
    ->Arg(65);

BENCHMARK(MemcpyManyAlternative2)
    ->Arg(16)
    ->Arg(17)
    ->Arg(18)
    ->Arg(32)
    ->Arg(33)
    ->Arg(34)
    ->Arg(60)
    ->Arg(64)
    ->Arg(65);
325
326 } // namespace
327 } // namespace tensorflow
328