// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H

namespace Eigen {

/** \class TensorShuffling
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor shuffling class.
  *
  * Expression that permutes (shuffles) the dimensions of a tensor expression
  * according to a given permutation of the dimension indices.
  */
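// A minimal usage sketch (the sizes and the permutation below are purely
// illustrative). Shuffling with the permutation {1, 2, 0} makes input
// dimension 1 become output dimension 0, input dimension 2 become output
// dimension 1, and input dimension 0 become output dimension 2:
//
//   Eigen::Tensor<float, 3> input(2, 3, 5);
//   input.setRandom();
//   Eigen::array<int, 3> shuffle{{1, 2, 0}};
//   Eigen::Tensor<float, 3> output = input.shuffle(shuffle);
//   // output.dimension(0) == 3, output.dimension(1) == 5,
//   // output.dimension(2) == 2, and
//   // output(j, k, i) == input(i, j, k) for all valid i, j, k.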
namespace internal {
template<typename Shuffle, typename XprType>
struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
{
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};

template<typename Shuffle, typename XprType>
struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense>
{
  typedef const TensorShufflingOp<Shuffle, XprType>& type;
};

template<typename Shuffle, typename XprType>
struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type>
{
  typedef TensorShufflingOp<Shuffle, XprType> type;
};

}  // end namespace internal


template<typename Shuffle, typename XprType>
class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
{
  public:
    typedef TensorBase<TensorShufflingOp<Shuffle, XprType> > Base;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename XprType::CoeffReturnType CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
        : m_xpr(expr), m_shuffle(shfl) {}

    EIGEN_DEVICE_FUNC
    const Shuffle& shufflePermutation() const { return m_shuffle; }

    EIGEN_DEVICE_FUNC
    const typename internal::remove_all<typename XprType::Nested>::type&
    expression() const { return m_xpr; }

    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp)

  protected:
    typename XprType::Nested m_xpr;
    const Shuffle m_shuffle;
};


// Eval as rvalue
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  enum {
    IsAligned = false,
    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
    PreferBlockAccess = true,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;

  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
                                                     Layout, Index>
      TensorBlock;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_device(device),
        m_impl(op.expression(), device)
  {
    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    const Shuffle& shuffle = op.shufflePermutation();
    m_is_identity = true;
    for (int i = 0; i < NumDims; ++i) {
      m_shuffle[i] = static_cast<int>(shuffle[i]);
      m_dimensions[i] = input_dims[shuffle[i]];
      m_inverseShuffle[shuffle[i]] = i;
      if (m_is_identity && shuffle[i] != i) {
        m_is_identity = false;
      }
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_unshuffledInputStrides[0] = 1;
      m_outputStrides[0] = 1;

      for (int i = 1; i < NumDims; ++i) {
        m_unshuffledInputStrides[i] =
            m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
            m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
      }
    } else {
      m_unshuffledInputStrides[NumDims - 1] = 1;
      m_outputStrides[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_unshuffledInputStrides[i] =
            m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
        m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
            m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
      }
    }

    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
    }
  }
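  // Worked example of the setup above (hypothetical sizes): for a ColMajor
  // input of dimensions (2, 3, 5) and the shuffle {1, 2, 0},
  //   m_dimensions             = (3, 5, 2)
  //   m_unshuffledInputStrides = (1, 2, 6)   // strides of the input
  //   m_outputStrides          = (1, 3, 15)  // strides of the shuffled output
  //   m_inputStrides           = (2, 6, 1)   // input stride of each output dim
  // so stepping by one along output dimension i advances the input linear
  // index by m_inputStrides[i] elements.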
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return true;
  }

#ifdef EIGEN_USE_THREADS
  template <typename EvalSubExprsCallback>
  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
      EvaluatorPointerType, EvalSubExprsCallback done) {
    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
  }
#endif  // EIGEN_USE_THREADS

  EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if (m_is_identity) {
      return m_impl.coeff(index);
    } else {
      return m_impl.coeff(srcCoeff(index));
    }
  }

  template <int LoadMode, typename Self, bool ImplPacketAccess>
  struct PacketLoader {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    static PacketReturnType Run(const Self& self, Index index) {
      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = self.coeff(index + i);
      }
      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
      return rslt;
    }
  };

  template<int LoadMode, typename Self>
  struct PacketLoader<LoadMode, Self, true> {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    static PacketReturnType Run(const Self& self, Index index) {
      if (self.m_is_identity) {
        return self.m_impl.template packet<LoadMode>(index);
      } else {
        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
        EIGEN_UNROLL_LOOP
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = self.coeff(index + i);
        }
        PacketReturnType rslt = internal::pload<PacketReturnType>(values);
        return rslt;
      }
    }
  };

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
    return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  internal::TensorBlockResourceRequirements getResourceRequirements() const {
    static const int inner_dim =
        Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;

    const size_t target_size = m_device.firstLevelCacheSize();
    const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;

    // A shuffled inner dimension leads to random memory accesses, which are
    // not captured by the default cost model's bytes loaded/stored, so we add
    // this cost explicitly. The number of cycles was picked based on
    // benchmarks.
    // TODO(ezhulenev): This number was picked based on very questionable
    // benchmarks; add benchmarks that are representative of real workloads.
    using BlockRequirements = internal::TensorBlockResourceRequirements;
    if (inner_dim_shuffled) {
      return BlockRequirements::uniform<Scalar>(target_size)
          .addCostPerCoeff({0, 0, NumDims * 28});
    } else {
      return BlockRequirements::skewed<Scalar>(target_size);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
        bool root_of_expr_ast = false) const {
    assert(m_impl.data() != NULL);

    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
        TensorBlockIO;
    typedef typename TensorBlockIO::Dst TensorBlockIODst;
    typedef typename TensorBlockIO::Src TensorBlockIOSrc;

    const typename TensorBlock::Storage block_storage =
        TensorBlock::prepareStorage(
            desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);

    typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
    TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));

    TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
                         block_storage.data());

    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);

    return block_storage.AsTensorMaterializedBlock();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
                                NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                           2 * TensorOpCost::MulCost<Index>() +
                                           TensorOpCost::DivCost<Index>());
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
  }

  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }

#ifdef EIGEN_USE_SYCL
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
  }
#endif

 protected:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
      Index input_index,
      const DSizes<Index, NumDims>& input_block_strides,
      const DSizes<Index, NumDims>& output_block_strides,
      const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
    Index output_index = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = input_index / fast_input_block_strides[i];
        output_index += idx * output_block_strides[m_inverseShuffle[i]];
        input_index -= idx * input_block_strides[i];
      }
      return output_index + input_index *
             output_block_strides[m_inverseShuffle[0]];
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = input_index / fast_input_block_strides[i];
        output_index += idx * output_block_strides[m_inverseShuffle[i]];
        input_index -= idx * input_block_strides[i];
      }
      return output_index + input_index *
             output_block_strides[m_inverseShuffle[NumDims - 1]];
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_fastOutputStrides[i];
        inputIndex += idx * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      return inputIndex + index * m_inputStrides[0];
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_fastOutputStrides[i];
        inputIndex += idx * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      return inputIndex + index * m_inputStrides[NumDims - 1];
    }
  }
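  // Sketch of how srcCoeff remaps indices, continuing the hypothetical
  // (2, 3, 5) input / shuffle {1, 2, 0} ColMajor example above: the output
  // linear index is decomposed into per-dimension coordinates using
  // m_outputStrides / m_fastOutputStrides, and each coordinate is multiplied
  // by the stride of the corresponding input dimension (m_inputStrides).
  // E.g. output index 7 -> output coords (1, 2, 0)
  //   -> input index 0 * 1 + 2 * 6 + 1 * 2 = 14,
  // i.e. output(1, 2, 0) reads input(0, 1, 2).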
  Dimensions m_dimensions;
  bool m_is_identity;
  array<int, NumDims> m_shuffle;
  array<Index, NumDims> m_inverseShuffle;  // TODO(ezhulenev): Make it int type.
  array<Index, NumDims> m_outputStrides;
  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_unshuffledInputStrides;

  const Device EIGEN_DEVICE_REF m_device;
  TensorEvaluator<ArgType, Device> m_impl;
};


// Eval as lvalue
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
    : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base;

  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

  enum {
    IsAligned = false,
    PacketAccess = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess = TensorEvaluator<ArgType, Device>::RawAccess,
    PreferBlockAccess = true,
    Layout = TensorEvaluator<ArgType, Device>::Layout,
    RawAccess = false
  };

  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : Base(op, device)
  { }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
  {
    return this->m_impl.coeffRef(this->srcCoeff(index));
  }

  template <int StoreMode> EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
    EIGEN_UNROLL_LOOP
    for (int i = 0; i < PacketSize; ++i) {
      this->coeffRef(index+i) = values[i];
    }
  }
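  // Scatters a materialized block back into the underlying input expression:
  // if the block does not expose a contiguous buffer, it is first
  // materialized into a temporary allocation, and TensorBlockIO then copies
  // it out using the inverse shuffle to map block (output) dimensions back to
  // input dimensions.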
  template <typename TensorBlock>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
      const TensorBlockDesc& desc, const TensorBlock& block) {
    eigen_assert(this->m_impl.data() != NULL);

    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
        TensorBlockIO;
    typedef typename TensorBlockIO::Dst TensorBlockIODst;
    typedef typename TensorBlockIO::Src TensorBlockIOSrc;

    const Scalar* block_buffer = block.data();

    // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen
    // expression with coefficient and packet access as `src`.
    void* mem = NULL;
    if (block_buffer == NULL) {
      mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
      ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);

      typedef internal::TensorBlockAssignment<
          ScalarNoConst, NumDims, typename TensorBlock::XprType, Index>
          TensorBlockAssignment;

      TensorBlockAssignment::Run(
          TensorBlockAssignment::target(
              desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
              buf),
          block.expr());

      block_buffer = buf;
    }

    // Read from block.
    TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
                         block_buffer);

    // Write to the output buffer.
    typename TensorBlockIO::Dimensions output_strides(
        this->m_unshuffledInputStrides);
    typename TensorBlockIO::Dimensions output_dimensions;
    for (int i = 0; i < NumDims; ++i) {
      output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
    }
    TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
                         this->srcCoeff(desc.offset()));

    // Reorder dimensions according to the shuffle.
    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
    for (int i = 0; i < NumDims; ++i) {
      dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
    }
    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);

    // Deallocate temporary buffer used for the block materialization.
    if (mem != NULL) this->m_device.deallocate(mem);
  }
};


}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H