// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H
#define EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H

namespace Eigen {

/** \class TensorShuffling
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor shuffling class.
  *
  * Applies a permutation to the dimensions of a tensor expression: the i-th
  * dimension of the result is the shuffle[i]-th dimension of the input.
  *
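  * An illustrative usage sketch via TensorBase::shuffle():
  * \code
  * Eigen::Tensor<float, 3> input(2, 3, 4);
  * input.setRandom();
  * Eigen::array<int, 3> shuffle = {2, 0, 1};
  * Eigen::Tensor<float, 3> output = input.shuffle(shuffle);
  * // output has dimensions (4, 2, 3) and output(i, j, k) == input(j, k, i).
  * \endcode
  *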
  */
namespace internal {
template<typename Shuffle, typename XprType>
struct traits<TensorShufflingOp<Shuffle, XprType> > : public traits<XprType>
{
  typedef typename XprType::Scalar Scalar;
  typedef traits<XprType> XprTraits;
  typedef typename XprTraits::StorageKind StorageKind;
  typedef typename XprTraits::Index Index;
  typedef typename XprType::Nested Nested;
  typedef typename remove_reference<Nested>::type _Nested;
  static const int NumDimensions = XprTraits::NumDimensions;
  static const int Layout = XprTraits::Layout;
  typedef typename XprTraits::PointerType PointerType;
};

template<typename Shuffle, typename XprType>
struct eval<TensorShufflingOp<Shuffle, XprType>, Eigen::Dense>
{
  typedef const TensorShufflingOp<Shuffle, XprType>& type;
};

template<typename Shuffle, typename XprType>
struct nested<TensorShufflingOp<Shuffle, XprType>, 1, typename eval<TensorShufflingOp<Shuffle, XprType> >::type>
{
  typedef TensorShufflingOp<Shuffle, XprType> type;
};

}  // end namespace internal



template<typename Shuffle, typename XprType>
class TensorShufflingOp : public TensorBase<TensorShufflingOp<Shuffle, XprType> >
{
  public:
    typedef TensorBase<TensorShufflingOp<Shuffle, XprType> > Base;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename XprType::CoeffReturnType CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorShufflingOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorShufflingOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorShufflingOp(const XprType& expr, const Shuffle& shfl)
      : m_xpr(expr), m_shuffle(shfl) {}

    EIGEN_DEVICE_FUNC
    const Shuffle& shufflePermutation() const { return m_shuffle; }

    EIGEN_DEVICE_FUNC
    const typename internal::remove_all<typename XprType::Nested>::type&
    expression() const { return m_xpr; }

    EIGEN_TENSOR_INHERIT_ASSIGNMENT_OPERATORS(TensorShufflingOp)


  protected:
    typename XprType::Nested m_xpr;
    const Shuffle m_shuffle;
};


// Eval as rvalue
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Self;
  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  enum {
    IsAligned         = false,
    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
    PreferBlockAccess = true,
    Layout            = TensorEvaluator<ArgType, Device>::Layout,
    CoordAccess       = false,  // to be implemented
    RawAccess         = false
  };

  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;

  typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
                                                     Layout, Index>
      TensorBlock;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_device(device),
        m_impl(op.expression(), device)
  {
    const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions();
    const Shuffle& shuffle = op.shufflePermutation();
    m_is_identity = true;
    for (int i = 0; i < NumDims; ++i) {
      m_shuffle[i] = static_cast<int>(shuffle[i]);
      m_dimensions[i] = input_dims[shuffle[i]];
      m_inverseShuffle[shuffle[i]] = i;
      if (m_is_identity && shuffle[i] != i) {
        m_is_identity = false;
      }
    }

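    // Compute strides for both layouts. m_fastOutputStrides caches
    // TensorIntDivisor helpers so that srcCoeff() can divide by the output
    // strides without issuing a hardware integer division per coefficient.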
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_unshuffledInputStrides[0] = 1;
      m_outputStrides[0] = 1;

      for (int i = 1; i < NumDims; ++i) {
        m_unshuffledInputStrides[i] =
            m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
        m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
                  m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
      }
    } else {
      m_unshuffledInputStrides[NumDims - 1] = 1;
      m_outputStrides[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_unshuffledInputStrides[i] =
            m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
        m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(
                  m_outputStrides[i] > 0 ? m_outputStrides[i] : Index(1));
      }
    }

    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = m_unshuffledInputStrides[shuffle[i]];
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType /*data*/) {
    m_impl.evalSubExprsIfNeeded(NULL);
    return true;
  }

#ifdef EIGEN_USE_THREADS
  template <typename EvalSubExprsCallback>
  EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
      EvaluatorPointerType, EvalSubExprsCallback done) {
    m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
  }
#endif  // EIGEN_USE_THREADS

  EIGEN_STRONG_INLINE void cleanup() {
    m_impl.cleanup();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    if (m_is_identity) {
      return m_impl.coeff(index);
    } else {
      return m_impl.coeff(srcCoeff(index));
    }
  }

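  // PacketLoader gathers a packet coefficient by coefficient when the input
  // evaluator does not provide packet access. The specialization for
  // ImplPacketAccess == true forwards to the input's packet() when the
  // shuffle is the identity permutation and falls back to a gather otherwise.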
  template <int LoadMode, typename Self, bool ImplPacketAccess>
  struct PacketLoader {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    static PacketReturnType Run(const Self& self, Index index) {
      EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
      EIGEN_UNROLL_LOOP
      for (int i = 0; i < PacketSize; ++i) {
        values[i] = self.coeff(index + i);
      }
      PacketReturnType rslt = internal::pload<PacketReturnType>(values);
      return rslt;
    }
  };

  template<int LoadMode, typename Self>
  struct PacketLoader<LoadMode, Self, true> {
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    static PacketReturnType Run(const Self& self, Index index) {
      if (self.m_is_identity) {
        return self.m_impl.template packet<LoadMode>(index);
      } else {
        EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
        EIGEN_UNROLL_LOOP
        for (int i = 0; i < PacketSize; ++i) {
          values[i] = self.coeff(index + i);
        }
        PacketReturnType rslt = internal::pload<PacketReturnType>(values);
        return rslt;
      }
    }
  };

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
    eigen_assert(index + PacketSize - 1 < dimensions().TotalSize());
    return PacketLoader<LoadMode, Self, TensorEvaluator<ArgType, Device>::PacketAccess>::Run(*this, index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
  internal::TensorBlockResourceRequirements getResourceRequirements() const {
    static const int inner_dim =
        Layout == static_cast<int>(ColMajor) ? 0 : NumDims - 1;

    const size_t target_size = m_device.firstLevelCacheSize();
    const bool inner_dim_shuffled = m_shuffle[inner_dim] != inner_dim;

    // A shuffled inner dimension leads to random memory accesses, which are not
    // captured by the default cost model's bytes loaded/stored. We add this cost
    // explicitly. The number of cycles was picked based on benchmarks.
    // TODO(ezhulenev): This number was picked based on very questionable
    // benchmarks; add benchmarks that are representative of real workloads.
    using BlockRequirements = internal::TensorBlockResourceRequirements;
    if (inner_dim_shuffled) {
      return BlockRequirements::uniform<Scalar>(target_size)
          .addCostPerCoeff({0, 0, NumDims * 28});
    } else {
      return BlockRequirements::skewed<Scalar>(target_size);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
  block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
          bool root_of_expr_ast = false) const {
    eigen_assert(m_impl.data() != NULL);

    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
        TensorBlockIO;
    typedef typename TensorBlockIO::Dst TensorBlockIODst;
    typedef typename TensorBlockIO::Src TensorBlockIOSrc;

    const typename TensorBlock::Storage block_storage =
        TensorBlock::prepareStorage(
            desc, scratch, /*allow_strided_storage=*/root_of_expr_ast);

    typename TensorBlockIO::Dimensions input_strides(m_unshuffledInputStrides);
    TensorBlockIOSrc src(input_strides, m_impl.data(), srcCoeff(desc.offset()));

    TensorBlockIODst dst(block_storage.dimensions(), block_storage.strides(),
                         block_storage.data());

    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map(m_shuffle);
    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);

    return block_storage.AsTensorMaterializedBlock();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    const double compute_cost = m_is_identity ? TensorOpCost::AddCost<Index>() :
                                NumDims * (2 * TensorOpCost::AddCost<Index>() +
                                           2 * TensorOpCost::MulCost<Index>() +
                                           TensorOpCost::DivCost<Index>());
    return m_impl.costPerCoeff(vectorized) +
           TensorOpCost(0, 0, compute_cost, m_is_identity /* vectorized */, PacketSize);
  }

  EIGEN_DEVICE_FUNC typename Storage::Type data() const { return NULL; }

#ifdef EIGEN_USE_SYCL
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_impl.bind(cgh);
  }
#endif
 protected:
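  // Maps a linear index within an input block to the corresponding linear
  // index within the output block, using the inverse shuffle to send each
  // input dimension to its output dimension.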
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index GetBlockOutputIndex(
      Index input_index,
      const DSizes<Index, NumDims>& input_block_strides,
      const DSizes<Index, NumDims>& output_block_strides,
      const DSizes<internal::TensorIntDivisor<Index>, NumDims>& fast_input_block_strides) const {
    Index output_index = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = input_index / fast_input_block_strides[i];
        output_index += idx * output_block_strides[m_inverseShuffle[i]];
        input_index -= idx * input_block_strides[i];
      }
      return output_index + input_index *
          output_block_strides[m_inverseShuffle[0]];
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = input_index / fast_input_block_strides[i];
        output_index += idx * output_block_strides[m_inverseShuffle[i]];
        input_index -= idx * input_block_strides[i];
      }
      return output_index + input_index *
          output_block_strides[m_inverseShuffle[NumDims - 1]];
    }
  }

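  // Maps a linear coefficient index in the shuffled output back to the
  // corresponding linear index in the unshuffled input: the index is
  // decomposed along the output strides and re-accumulated with the
  // shuffled input strides.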
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_fastOutputStrides[i];
        inputIndex += idx * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      return inputIndex + index * m_inputStrides[0];
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_fastOutputStrides[i];
        inputIndex += idx * m_inputStrides[i];
        index -= idx * m_outputStrides[i];
      }
      return inputIndex + index * m_inputStrides[NumDims - 1];
    }
  }

  Dimensions m_dimensions;
  bool m_is_identity;
  array<int, NumDims> m_shuffle;
  array<Index, NumDims> m_inverseShuffle;  // TODO(ezhulenev): Make it int type.
  array<Index, NumDims> m_outputStrides;
  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_unshuffledInputStrides;

  const Device EIGEN_DEVICE_REF m_device;
  TensorEvaluator<ArgType, Device> m_impl;
};


// Eval as lvalue
template<typename Shuffle, typename ArgType, typename Device>
struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device>
    : public TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
{
  typedef TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> Base;

  typedef TensorShufflingOp<Shuffle, ArgType> XprType;
  typedef typename XprType::Index Index;
  static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;

  enum {
    IsAligned         = false,
    PacketAccess      = (PacketType<CoeffReturnType, Device>::size > 1),
    BlockAccess       = TensorEvaluator<ArgType, Device>::RawAccess,
    PreferBlockAccess = true,
    Layout            = TensorEvaluator<ArgType, Device>::Layout,
    RawAccess         = false
  };

  typedef typename internal::remove_const<Scalar>::type ScalarNoConst;

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : Base(op, device)
  { }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index)
  {
    return this->m_impl.coeffRef(this->srcCoeff(index));
  }

  template <int StoreMode> EIGEN_STRONG_INLINE
  void writePacket(Index index, const PacketReturnType& x)
  {
    EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)

    EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
    internal::pstore<CoeffReturnType, PacketReturnType>(values, x);
    EIGEN_UNROLL_LOOP
    for (int i = 0; i < PacketSize; ++i) {
      this->coeffRef(index+i) = values[i];
    }
  }

  template <typename TensorBlock>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writeBlock(
      const TensorBlockDesc& desc, const TensorBlock& block) {
    eigen_assert(this->m_impl.data() != NULL);

    typedef internal::TensorBlockIO<ScalarNoConst, Index, NumDims, Layout>
        TensorBlockIO;
    typedef typename TensorBlockIO::Dst TensorBlockIODst;
    typedef typename TensorBlockIO::Src TensorBlockIOSrc;

    const Scalar* block_buffer = block.data();

    // TODO(ezhulenev): TensorBlockIO should be able to read from any Eigen
    // expression with coefficient and packet access as `src`.
    void* mem = NULL;
    if (block_buffer == NULL) {
      mem = this->m_device.allocate(desc.size() * sizeof(Scalar));
      ScalarNoConst* buf = static_cast<ScalarNoConst*>(mem);

      typedef internal::TensorBlockAssignment<
          ScalarNoConst, NumDims, typename TensorBlock::XprType, Index>
          TensorBlockAssignment;

      TensorBlockAssignment::Run(
          TensorBlockAssignment::target(
              desc.dimensions(), internal::strides<Layout>(desc.dimensions()),
              buf),
          block.expr());

      block_buffer = buf;
    }

    // Read from block.
    TensorBlockIOSrc src(internal::strides<Layout>(desc.dimensions()),
                         block_buffer);

    // Write to the output buffer.
    typename TensorBlockIO::Dimensions output_strides(
        this->m_unshuffledInputStrides);
    typename TensorBlockIO::Dimensions output_dimensions;
    for (int i = 0; i < NumDims; ++i) {
      output_dimensions[this->m_shuffle[i]] = desc.dimension(i);
    }
    TensorBlockIODst dst(output_dimensions, output_strides, this->m_impl.data(),
                         this->srcCoeff(desc.offset()));

    // Reorder dimensions according to the shuffle.
    typename TensorBlockIO::DimensionsMap dst_to_src_dim_map;
    for (int i = 0; i < NumDims; ++i) {
      dst_to_src_dim_map[i] = static_cast<int>(this->m_inverseShuffle[i]);
    }
    TensorBlockIO::Copy(dst, src, dst_to_src_dim_map);

    // Deallocate temporary buffer used for the block materialization.
    if (mem != NULL) this->m_device.deallocate(mem);
  }
};


} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_SHUFFLING_H