#ifndef THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_
#define THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_

typedef int TensorIndex;
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int

#include "unsupported/Eigen/CXX11/Tensor"
#include "benchmark.h"

#define BENCHMARK_RANGE(bench, lo, hi) \
  BENCHMARK(bench)->Range(lo, hi)

using Eigen::Tensor;
using Eigen::TensorMap;

// TODO(bsteiner): also templatize on the input type since we have users
// for int8 as well as floats.
template <typename Device, typename T> class BenchmarkSuite {
 public:
  BenchmarkSuite(const Device& device, size_t m, size_t k, size_t n)
      : m_(m), k_(k), n_(n), device_(device) {
    initialize();
  }

  BenchmarkSuite(const Device& device, size_t m)
      : m_(m), k_(m), n_(m), device_(device) {
    initialize();
  }

  ~BenchmarkSuite() {
    device_.deallocate(a_);
    device_.deallocate(b_);
    device_.deallocate(c_);
  }
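  // A minimal sketch of how a driver might exercise this suite. The driver
  // name and size range below are hypothetical; the real entry points live
  // in the tensor_benchmarks_*.cc files, which are not part of this header:
  //
  //   static void BM_memcpy(int iters, int N) {
  //     Eigen::DefaultDevice device;
  //     BenchmarkSuite<Eigen::DefaultDevice, float> suite(device, N);
  //     suite.memcpy(iters);
  //   }
  //   BENCHMARK_RANGE(BM_memcpy, 10, 5000);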
  void memcpy(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      device_.memcpy(c_, a_, m_ * m_ * sizeof(T));
    }
    // Record the number of values copied per second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void typeCasting(int num_iters) {
    eigen_assert(m_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    if (sizeof(T) >= sizeof(int)) {
      sizes[0] = m_;
      sizes[1] = k_;
    } else {
      sizes[0] = m_ * sizeof(T) / sizeof(int);
      sizes[1] = k_ * sizeof(T) / sizeof(int);
    }
    const TensorMap<Tensor<int, 2, 0, TensorIndex>, Eigen::Aligned> A((int*)a_, sizes);
    TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, sizes);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.template cast<T>();
    }
    // Record the number of values cast per second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  void random(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = C.random();
    }
    // Record the number of random numbers generated per second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void slicing(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    const Eigen::DSizes<TensorIndex, 2> quarter_sizes(m_/2, m_/2);
    const Eigen::DSizes<TensorIndex, 2> first_quadrant(0, 0);
    const Eigen::DSizes<TensorIndex, 2> second_quadrant(0, m_/2);
    const Eigen::DSizes<TensorIndex, 2> third_quadrant(m_/2, 0);
    const Eigen::DSizes<TensorIndex, 2> fourth_quadrant(m_/2, m_/2);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.slice(first_quadrant, quarter_sizes).device(device_) =
          A.slice(first_quadrant, quarter_sizes);
      C.slice(second_quadrant, quarter_sizes).device(device_) =
          B.slice(second_quadrant, quarter_sizes);
      C.slice(third_quadrant, quarter_sizes).device(device_) =
          A.slice(third_quadrant, quarter_sizes);
      C.slice(fourth_quadrant, quarter_sizes).device(device_) =
          B.slice(fourth_quadrant, quarter_sizes);
    }
    // Record the number of values copied from the rhs slice to the lhs slice
    // each second
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void rowChip(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.chip(iter % k_, 0);
    }
    // Record the number of values copied from the rhs chip to the lhs.
    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
  }

  void colChip(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.chip(iter % n_, 1);
    }
    // Record the number of values copied from the rhs chip to the lhs.
    finalizeBenchmark(static_cast<int64_t>(n_) * num_iters);
  }

  void shuffling(int num_iters) {
    eigen_assert(m_ == n_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = k_;
    size_b[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

    Eigen::array<int, 2> shuffle;
    shuffle[0] = 1;
    shuffle[1] = 0;

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.shuffle(shuffle);
    }
    // Record the number of values shuffled from A and copied to B each second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  void padding(int num_iters) {
    eigen_assert(m_ == k_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_ - 3;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = k_;
    size_b[1] = m_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

#if defined(EIGEN_HAS_INDEX_LIST)
    Eigen::IndexPairList<Eigen::type2indexpair<0, 0>,
                         Eigen::type2indexpair<2, 1> > paddings;
#else
    Eigen::array<Eigen::IndexPair<TensorIndex>, 2> paddings;
    paddings[0] = Eigen::IndexPair<TensorIndex>(0, 0);
    paddings[1] = Eigen::IndexPair<TensorIndex>(2, 1);
#endif

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.pad(paddings);
    }
    // Record the number of values copied from the padded tensor A each second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }

  void striding(int num_iters) {
    eigen_assert(m_ == k_);
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = k_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_b;
    size_b[0] = m_;
    size_b[1] = k_/2;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, size_b);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 2> strides;
    strides[0] = 1;
    strides[1] = 2;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>, Eigen::type2index<2> > strides;
#endif

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      B.device(device_) = A.stride(strides);
    }
    // Record the number of values copied from the strided tensor A each second
    finalizeBenchmark(static_cast<int64_t>(m_) * k_ * num_iters);
  }
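  // For reference, a sketch of the stride semantics benchmarked above: with
  // strides of (1, 2), every row of A is kept but only every second column,
  // so for even k_ an m_ x k_ input yields an m_ x (k_/2) result with
  // B(i, j) = A(i, 2*j).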
  void broadcasting(int num_iters) {
    Eigen::array<TensorIndex, 2> size_a;
    size_a[0] = m_;
    size_a[1] = 1;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, size_a);
    Eigen::array<TensorIndex, 2> size_c;
    size_c[0] = m_;
    size_c[1] = n_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, size_c);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<int, 2> broadcast;
    broadcast[0] = 1;
    broadcast[1] = n_;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>, int> broadcast;
    broadcast.set(1, n_);
#endif

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.broadcast(broadcast);
    }
    // Record the number of values broadcast from A and copied to C each second
    finalizeBenchmark(static_cast<int64_t>(m_) * n_ * num_iters);
  }
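  // For reference, a sketch of the broadcast semantics benchmarked above:
  // with A of shape (m_, 1) and broadcast factors (1, n_), the single column
  // of A is replicated n_ times, i.e. C(i, j) = A(i, 0) for every j < n_.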
  void coeffWiseOp(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A * A.constant(static_cast<T>(3.14)) +
                          B * B.constant(static_cast<T>(2.7));
    }
    // Record the number of FLOPs executed per second (2 multiplications and
    // 1 addition per value)
    finalizeBenchmark(static_cast<int64_t>(3) * m_ * m_ * num_iters);
  }

  void algebraicFunc(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.rsqrt() + B.sqrt() * B.square();
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  void transcendentalFunc(int num_iters) {
    eigen_assert(m_ == k_ && k_ == n_);
    Eigen::array<TensorIndex, 2> sizes;
    sizes[0] = m_;
    sizes[1] = m_;
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizes);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizes);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizes);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.exp() + B.log();
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(m_) * m_ * num_iters);
  }

  // Row reduction
  void rowReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = n_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(c_, output_size);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 1> sum_along_dim;
    sum_along_dim[0] = 0;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<0>> sum_along_dim;
#endif

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum(sum_along_dim);
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

  // Column reduction
  void colReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
        b_, input_size);
    Eigen::array<TensorIndex, 1> output_size;
    output_size[0] = k_;
    TensorMap<Tensor<T, 1, 0, TensorIndex>, Eigen::Aligned> C(
        c_, output_size);

#ifndef EIGEN_HAS_INDEX_LIST
    Eigen::array<TensorIndex, 1> sum_along_dim;
    sum_along_dim[0] = 1;
#else
    // Take advantage of cxx11 to give the compiler information it can use to
    // optimize the code.
    Eigen::IndexList<Eigen::type2index<1>> sum_along_dim;
#endif

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum(sum_along_dim);
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }
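  // For reference, a sketch of the reduction semantics benchmarked above: for
  // B of shape (k_, n_), summing along dimension 0 folds the first index and
  // yields one value per column (C(j) = sum_i B(i, j), n_ outputs), while
  // summing along dimension 1 yields one value per row (C(i) = sum_j B(i, j),
  // k_ outputs).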
  // Full reduction
  void fullReduction(int num_iters) {
    Eigen::array<TensorIndex, 2> input_size;
    input_size[0] = k_;
    input_size[1] = n_;
    const TensorMap<Tensor<T, 2, 0, TensorIndex>, Eigen::Aligned> B(
        b_, input_size);
    Eigen::array<TensorIndex, 0> output_size;
    TensorMap<Tensor<T, 0, 0, TensorIndex>, Eigen::Aligned> C(
        c_, output_size);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = B.sum();
    }
    // Record the number of FLOPs executed per second (assuming one operation
    // per value)
    finalizeBenchmark(static_cast<int64_t>(k_) * n_ * num_iters);
  }

  // Do a contraction that is equivalent to a matrix multiplication
  void contraction(int num_iters) {
    Eigen::array<TensorIndex, 2> sizeA;
    sizeA[0] = m_;
    sizeA[1] = k_;
    Eigen::array<TensorIndex, 2> sizeB;
    sizeB[0] = k_;
    sizeB[1] = n_;
    Eigen::array<TensorIndex, 2> sizeC;
    sizeC[0] = m_;
    sizeC[1] = n_;

    const TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, sizeA);
    const TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, sizeB);
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, sizeC);

    typedef typename Tensor<T, 2>::DimensionPair DimPair;
    Eigen::array<DimPair, 1> dims;
    dims[0] = DimPair(1, 0);

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.contract(B, dims);
    }
    // Record the number of FLOPs executed per second (k_ multiplications and
    // k_ additions for each value in the resulting tensor)
    finalizeBenchmark(static_cast<int64_t>(2) * m_ * n_ * k_ * num_iters);
  }

  void convolution(int num_iters, int kernel_x, int kernel_y) {
    Eigen::array<TensorIndex, 2> input_sizes;
    input_sizes[0] = m_;
    input_sizes[1] = n_;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> A(a_, input_sizes);
    Eigen::array<TensorIndex, 2> kernel_sizes;
    kernel_sizes[0] = kernel_x;
    kernel_sizes[1] = kernel_y;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> B(b_, kernel_sizes);
    Eigen::array<TensorIndex, 2> result_sizes;
    result_sizes[0] = m_ - kernel_x + 1;
    result_sizes[1] = n_ - kernel_y + 1;
    TensorMap<Tensor<T, 2>, Eigen::Aligned> C(c_, result_sizes);
    Eigen::array<TensorIndex, 2> dims;
    dims[0] = 0;
    dims[1] = 1;

    StartBenchmarkTiming();
    for (int iter = 0; iter < num_iters; ++iter) {
      C.device(device_) = A.convolve(B, dims);
    }
    // Record the number of FLOPs executed per second (kernel_size
    // multiplications and additions for each value in the resulting tensor)
    finalizeBenchmark(static_cast<int64_t>(2) *
        (m_ - kernel_x + 1) * (n_ - kernel_y + 1) *
        kernel_x * kernel_y * num_iters);
  }
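  // For reference, a sketch of the contraction above: DimPair(1, 0) contracts
  // the second index of A with the first index of B, so for A of shape
  // (m_, k_) and B of shape (k_, n_) it computes the matrix product
  // C(i, j) = sum_l A(i, l) * B(l, j), hence the 2 * m_ * n_ * k_ FLOP count.
  // The convolution above is the "valid" kind: the kernel B slides over A
  // without padding, which is why the result shrinks to
  // (m_ - kernel_x + 1) x (n_ - kernel_y + 1).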
 private:
  void initialize() {
    a_ = (T *) device_.allocate(m_ * k_ * sizeof(T));
    b_ = (T *) device_.allocate(k_ * n_ * sizeof(T));
    c_ = (T *) device_.allocate(m_ * n_ * sizeof(T));

    // Initialize the content of the memory pools to prevent asan from
    // complaining.
    device_.memset(a_, 12, m_ * k_ * sizeof(T));
    device_.memset(b_, 23, k_ * n_ * sizeof(T));
    device_.memset(c_, 31, m_ * n_ * sizeof(T));

    //BenchmarkUseRealTime();
  }

  inline void finalizeBenchmark(int64_t num_items) {
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
    if (Eigen::internal::is_same<Device, Eigen::GpuDevice>::value) {
      device_.synchronize();
    }
#endif
    StopBenchmarkTiming();
    SetBenchmarkFlopsProcessed(num_items);
  }

  TensorIndex m_;
  TensorIndex k_;
  TensorIndex n_;
  T* a_;
  T* b_;
  T* c_;
  Device device_;
};

#endif  // THIRD_PARTY_EIGEN3_TENSOR_BENCHMARKS_H_