/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <functional>
#include <memory>
#include <vector>

#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/graph/testlib.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/prefetch.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"

namespace tensorflow {
namespace {

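// Fills 't' with random values of type T and returns, via 'bytes', the number
// of bytes written.  The generic version ignores 'string_length'.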
template <typename T>
void FillTensorWithRandomValues(Tensor* t, int string_length, int64* bytes) {
  t->flat<T>().setRandom();
  *bytes = t->flat<T>().size() * sizeof(T);
}

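// Specialization for tstring: each element becomes a string of 'string_length'
// 'x' characters; 'bytes' counts both the tstring objects and their character
// payloads.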
template <>
void FillTensorWithRandomValues<tstring>(Tensor* t, int string_length,
                                         int64* bytes) {
  auto ts = t->flat<tstring>();
  *bytes = 0;
  for (int i = 0; i < ts.size(); i++) {
    ts(i) = tstring(string_length, 'x');
    *bytes += sizeof(ts(i)) + ts(i).size();
  }
}

// For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim2'
// in size, and concat them together along "concat_dimension".  If T is
// tstring, then the individual strings in the tensors will each have length
// "string_length".
template <typename T>
static void ConcatHelper(::testing::benchmark::State& state,
                         int concat_dimension, int dim2,
                         int string_length = 0) {
  Graph* g = new Graph(OpRegistry::Global());

  DataType dt = DataTypeToEnum<T>::v();
  const int kDim1 = 100;
  Tensor concat_dim(DT_INT32, TensorShape({}));
  concat_dim.scalar<int32>()() = concat_dimension;
  Tensor in0(dt, TensorShape({kDim1, dim2}));
  Tensor in1(dt, TensorShape({kDim1, dim2}));
  int64_t in0_bytes, in1_bytes;
  FillTensorWithRandomValues<T>(&in0, string_length, &in0_bytes);
  FillTensorWithRandomValues<T>(&in1, string_length, &in1_bytes);

  Node* node;
  TF_CHECK_OK(
      NodeBuilder(g->NewName("n"), "Concat")
          .Input(test::graph::Constant(g, concat_dim))
          .Input({test::graph::Constant(g, in0), test::graph::Constant(g, in1)})
          .Attr("N", 2)
          .Attr("T", dt)
          .Finalize(g, &node));

  test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
  state.SetBytesProcessed(static_cast<int64>(state.iterations()) *
                          (in0_bytes + in1_bytes));
}

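// Concat of two [100 x dim2] float tensors along dimension 0 (rows) and
// dimension 1 (columns), respectively.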
void BM_ConcatDim0Float(::testing::benchmark::State& state) {
  const int dim2 = state.range(0);

  ConcatHelper<float>(state, 0, dim2);
}

void BM_ConcatDim1Float(::testing::benchmark::State& state) {
  const int dim2 = state.range(0);

  ConcatHelper<float>(state, 1, dim2);
}

BENCHMARK(BM_ConcatDim0Float)
    ->UseRealTime()
    ->Arg(1000)
    ->Arg(100000)
    ->Arg(1000000);
BENCHMARK(BM_ConcatDim1Float)
    ->UseRealTime()
    ->Arg(1000)
    ->Arg(100000)
    ->Arg(1000000);

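// String concat along dimension 0.  range(0) is dim2 and range(1) is the
// length of each individual string.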
void BM_ConcatDim0String(::testing::benchmark::State& state) {
  const int dim2 = state.range(0);
  const int string_length = state.range(1);

  ConcatHelper<tstring>(state, 0, dim2, string_length);
}

BENCHMARK(BM_ConcatDim0String)
    ->UseRealTime()
    ->ArgPair(1, 16)
    ->ArgPair(1, 10000)
    ->ArgPair(100, 16);

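// Dimension-1 concat for smaller element types (uint8, int16, bfloat16).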
void BM_ConcatDim1uint8(::testing::benchmark::State& state) {
  const int dim2 = state.range(0);

  ConcatHelper<uint8>(state, 1, dim2);
}
void BM_ConcatDim1int16(::testing::benchmark::State& state) {
  const int dim2 = state.range(0);

  ConcatHelper<int16>(state, 1, dim2);
}
void BM_ConcatDim1bfloat16(::testing::benchmark::State& state) {
  const int dim2 = state.range(0);

  ConcatHelper<bfloat16>(state, 1, dim2);
}

BENCHMARK(BM_ConcatDim1uint8)
    ->UseRealTime()
    ->Arg(1000)
    ->Arg(100000)
    ->Arg(1000000);
BENCHMARK(BM_ConcatDim1int16)
    ->UseRealTime()
    ->Arg(1000)
    ->Arg(100000)
    ->Arg(1000000);
BENCHMARK(BM_ConcatDim1bfloat16)
    ->UseRealTime()
    ->Arg(1000)
    ->Arg(100000)
    ->Arg(1000000);

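// Like ConcatHelper, but concats kNumInputs tensors of shape [40000 x dim2]
// along "concat_dimension".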
template <typename T>
static void ConcatManyHelper(::testing::benchmark::State& state,
                             int concat_dimension, int dim2) {
  Graph* g = new Graph(OpRegistry::Global());

  DataType dt = DataTypeToEnum<T>::v();
  const int kDim1 = 40000;
  const int kNumInputs = 64;
  Tensor concat_dim(DT_INT32, TensorShape({}));
  concat_dim.scalar<int32>()() = concat_dimension;
  std::vector<NodeBuilder::NodeOut> inputs;
  inputs.reserve(kNumInputs);
  for (int i = 0; i < kNumInputs; ++i) {
    Tensor in(dt, TensorShape({kDim1, dim2}));
    in.flat<T>().setRandom();
    inputs.push_back(test::graph::Constant(g, in));
  }

  Node* node;
  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Concat")
                  .Input(test::graph::Constant(g, concat_dim))
                  .Input(inputs)
                  .Attr("N", 64)
                  .Attr("T", dt)
                  .Finalize(g, &node));
  test::Benchmark("cpu", g, /*old_benchmark_api=*/false).Run(state);
  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim1 *
                          dim2 * kNumInputs * sizeof(T));
}

void BM_ConcatManyDim1bfloat16(::testing::benchmark::State& state) {
  const int dim2 = state.range(0);

  ConcatManyHelper<bfloat16>(state, 1, dim2);
}

BENCHMARK(BM_ConcatManyDim1bfloat16)->UseRealTime()->Arg(18)->Arg(34)->Arg(60);

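// Baseline for the two-input Concat benchmarks: concatenates two
// [100 x dim2] float buffers with raw memcpy, bypassing the graph and kernel
// machinery entirely.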
void MemcpyAlternativeHelper(::testing::benchmark::State& state, int dim2) {
  const int kDim1 = 100;
  std::vector<float> data1(kDim1 * dim2, 1.0f);
  std::vector<float> data2(kDim1 * dim2, 2.0f);

  for (auto s : state) {
    const size_t n0 = data1.size();
    const size_t n1 = data2.size();
    float* result = new float[n0 + n1];
    memcpy(&result[0], &data1[0], n0 * sizeof(float));
    memcpy(&result[n0], &data2[0], n1 * sizeof(float));
    delete[] result;
  }
  state.SetBytesProcessed(static_cast<int64>(state.iterations()) *
                          ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float));
}

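// The memcpy baseline does the same work regardless of concat dimension, so
// the Dim0 and Dim1 variants below differ only in name.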
void BM_MemcpyAlternativeDim0(::testing::benchmark::State& state) {
  const int dim2 = state.range(0);

  MemcpyAlternativeHelper(state, dim2);
}
void BM_MemcpyAlternativeDim1(::testing::benchmark::State& state) {
  const int dim2 = state.range(0);

  MemcpyAlternativeHelper(state, dim2);
}

BENCHMARK(BM_MemcpyAlternativeDim0)
    ->UseRealTime()
    ->Arg(1000)
    ->Arg(100000)
    ->Arg(1000000);
BENCHMARK(BM_MemcpyAlternativeDim1)
    ->UseRealTime()
    ->Arg(1000)
    ->Arg(100000)
    ->Arg(1000000);

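// Hand-rolled counterparts to BM_ConcatManyDim1bfloat16: copy 64 inputs of
// shape [40000 x dim2] into an interleaved result buffer with memcpy and
// explicit prefetching.  Alternative1 walks one input at a time;
// Alternative2 interleaves the copies row by row.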
typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
                         Eigen::Unaligned>
    EigenMap;
void MemcpyManyAlternative1(::testing::benchmark::State& state) {
  int dim2 = state.range(0);
  const int kDim1 = 40000;
  const int kNumCopies = 64;
  const int size = kDim1 * dim2 * kNumCopies;
  bfloat16* data = new bfloat16[size];
  EigenMap map(data, size);
  map.setRandom();

  for (auto s : state) {
    std::vector<bfloat16*> inputs(kNumCopies);
    for (int i = 0; i < kNumCopies; ++i) {
      inputs[i] = &data[i * kDim1 * dim2];
    }
    bfloat16* result = new bfloat16[size];
    for (int j = 0; j < kNumCopies; ++j) {
      bfloat16* output = &result[j * dim2];
      for (int i = 0; i < kDim1; ++i) {
        if (i + 1 < kDim1) {
          port::prefetch<port::PREFETCH_HINT_T0>(inputs[j] + dim2);
        }
        memcpy(output, inputs[j], dim2 * sizeof(bfloat16));
        inputs[j] += dim2;
        output += dim2 * kNumCopies;
      }
    }
    delete[] result;
  }
  delete[] data;
  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim1 *
                          dim2 * kNumCopies * sizeof(bfloat16));
}

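// Same copy as MemcpyManyAlternative1, but iterates row-major across all
// inputs so the result buffer is written contiguously.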
void MemcpyManyAlternative2(::testing::benchmark::State& state) {
  int dim2 = state.range(0);
  const int kDim1 = 40000;
  const int kNumCopies = 64;
  const int size = kDim1 * dim2 * kNumCopies;
  bfloat16* data = new bfloat16[size];
  EigenMap map(data, size);
  map.setRandom();

  std::vector<bfloat16*> inputs(kNumCopies);
  for (auto s : state) {
    bfloat16* result = new bfloat16[size];
    for (int i = 0; i < kNumCopies; ++i) {
      inputs[i] = &data[i * kDim1 * dim2];
    }
    bfloat16* output = result;
    for (int i = 0; i < kDim1; ++i) {
      for (int j = 0; j < kNumCopies; ++j) {
        if (j + 1 < kNumCopies) {
          port::prefetch<port::PREFETCH_HINT_T0>(inputs[j + 1]);
        }
        memcpy(output, inputs[j], dim2 * sizeof(bfloat16));
        inputs[j] += dim2;
        output += dim2;
      }
    }
    delete[] result;
  }
  delete[] data;

  state.SetBytesProcessed(static_cast<int64>(state.iterations()) * kDim1 *
                          dim2 * kNumCopies * sizeof(bfloat16));
}

BENCHMARK(MemcpyManyAlternative1)
    ->Arg(16)
    ->Arg(17)
    ->Arg(18)
    ->Arg(32)
    ->Arg(33)
    ->Arg(34)
    ->Arg(60)
    ->Arg(64)
    ->Arg(65);

BENCHMARK(MemcpyManyAlternative2)
    ->Arg(16)
    ->Arg(17)
    ->Arg(18)
    ->Arg(32)
    ->Arg(33)
    ->Arg(34)
    ->Arg(60)
    ->Arg(64)
    ->Arg(65);

}  // namespace
}  // namespace tensorflow