1 // This file is part of Eigen, a lightweight C++ template library
2 // for linear algebra.
3 //
4 // This Source Code Form is subject to the terms of the Mozilla
5 // Public License v. 2.0. If a copy of the MPL was not distributed
6 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
7
8 #ifndef EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
9 #define EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
10
11 namespace Eigen {
12 namespace internal {
13
14 // -------------------------------------------------------------------------- //
15 // Forward declarations for templates defined below.
16 template <typename Scalar, typename IndexType, int NumDims, int Layout>
17 class TensorBlockIO;
18
19 // -------------------------------------------------------------------------- //
20 // Helper function to compute strides for a densely stored buffer of the given
21 // dimensions.
22
23 // TODO(ezhulenev): We compute strides 1000 times in different evaluators, use
24 // this function instead everywhere.
25 template <int Layout, typename IndexType, int NumDims>
26 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
27 const DSizes<IndexType, NumDims>& dimensions) {
28 DSizes<IndexType, NumDims> strides;
29 if (NumDims == 0) return strides;
30
31 // TODO(ezhulenev): Use templates to unroll this loop (similar to
32 // h_array_reduce in CXX11meta.h)? Benchmark it.
33 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
34 strides[0] = 1;
35 for (int i = 1; i < NumDims; ++i) {
36 strides[i] = strides[i - 1] * dimensions[i - 1];
37 }
38 } else {
39 strides[NumDims - 1] = 1;
40 for (int i = NumDims - 2; i >= 0; --i) {
41 strides[i] = strides[i + 1] * dimensions[i + 1];
42 }
43 }
44
45 return strides;
46 }
47
48 template <int Layout, typename IndexType, size_t NumDims>
49 EIGEN_ALWAYS_INLINE DSizes<IndexType, NumDims> strides(
50 const Eigen::array<IndexType, NumDims>& dimensions) {
51 return strides<Layout>(DSizes<IndexType, NumDims>(dimensions));
52 }
53
54 template <int Layout, std::ptrdiff_t... Indices>
55 EIGEN_STRONG_INLINE DSizes<std::ptrdiff_t, sizeof...(Indices)> strides(
56 const Sizes<Indices...>& sizes) {
57 return strides<Layout>(DSizes<std::ptrdiff_t, sizeof...(Indices)>(sizes));
58 }
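// Example (illustrative sketch based on the overloads above): dense strides
// for a 4x5x6 buffer in both storage orders.
//
//   DSizes<Eigen::Index, 3> dims(4, 5, 6);
//   DSizes<Eigen::Index, 3> col = strides<ColMajor>(dims);  // [1, 4, 20]
//   DSizes<Eigen::Index, 3> row = strides<RowMajor>(dims);  // [30, 6, 1]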
59
60 // -------------------------------------------------------------------------- //
61
62 // Tensor block shape type defines the shape preference for the blocks
63 // extracted from the larger tensor.
64 //
65 // Example: blocks of 100 elements from the large 100x100 tensor:
66 // - tensor: 100x100
67 // - target_block_size: 100
68 //
69 // TensorBlockShapeType:
70 // - kUniformAllDims: 100 blocks of size 10x10
71 // - kSkewedInnerDims: 100 blocks of size 100x1 (or 1x100 depending on a column
72 // or row major layout)
73 enum class TensorBlockShapeType { kUniformAllDims, kSkewedInnerDims };
74
75 struct TensorBlockResourceRequirements {
76 TensorBlockShapeType shape_type; // target block shape
77 size_t size; // target block size
78 TensorOpCost cost_per_coeff; // cost of computing a single block element
79
80 #ifdef EIGEN_HIPCC
81 // For HIPCC, we need to explicitly declare the constructor, which is
82 // implicitly invoked in the "merge" / "any" routines, as a "device function";
83 // otherwise HIPCC errors out complaining about the lack of a matching constructor.
84 EIGEN_DEVICE_FUNC
85 TensorBlockResourceRequirements(TensorBlockShapeType shape_type_, size_t size_,
86 TensorOpCost cost_)
87 : shape_type(shape_type_), size(size_), cost_per_coeff(cost_)
88 {}
89 #endif
90
91 template <typename Scalar>
92 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
93 TensorBlockShapeType shape_type, size_t size_in_bytes,
94 TensorOpCost cost) {
95 const size_t size = numext::maxi(size_t(1), size_in_bytes / sizeof(Scalar));
96 return {shape_type, size, cost};
97 }
98
99 template <typename Scalar>
100 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements withShapeAndSize(
101 TensorBlockShapeType shape_type, size_t size_in_bytes) {
102 // This default cost per coefficient is valid for most materialized tensor
103 // block evaluation implementations, because they typically just read
104 // coefficients from the underlying tensor storage, and write to the tensor
105 // block buffer (scratch or destination memory, reads and writes have linear
106 // access pattern). We ignore the fixed cost of block evaluation, because in
107 // practice it should be negligible.
108 //
109 // Lazy block evaluation adds the cost of calling a functor for each
110 // coefficient.
111 //
112 // All non-trivial block evaluation implementations must provide their own
113 // cost approximation (e.g. shuffling the inner dimension has a much higher cost
114 // because it reads memory randomly, although the total number of moved
115 // bytes is the same).
116 return withShapeAndSize<Scalar>(shape_type, size_in_bytes,
117 {/*bytes_loaded=*/sizeof(Scalar),
118 /*bytes_stored=*/sizeof(Scalar),
119 /*compute_cycles=*/0});
120 }
121
122 template <typename Scalar>
123 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements skewed(
124 size_t size_in_bytes) {
125 return withShapeAndSize<Scalar>(TensorBlockShapeType::kSkewedInnerDims,
126 size_in_bytes);
127 }
128
129 template <typename Scalar>
130 EIGEN_DEVICE_FUNC static TensorBlockResourceRequirements uniform(
131 size_t size_in_bytes) {
132 return withShapeAndSize<Scalar>(TensorBlockShapeType::kUniformAllDims,
133 size_in_bytes);
134 }
135
136 EIGEN_DEVICE_FUNC
137 static EIGEN_STRONG_INLINE TensorBlockResourceRequirements
138 merge(const TensorBlockResourceRequirements& lhs,
139 const TensorBlockResourceRequirements& rhs) {
140 return {merge(lhs.shape_type, rhs.shape_type), // shape_type
141 merge(lhs.size, rhs.size), // size
142 merge(lhs.cost_per_coeff, rhs.cost_per_coeff)}; // cost_per_coeff
143 }
144
145 EIGEN_DEVICE_FUNC TensorBlockResourceRequirements& addCostPerCoeff(
146 TensorOpCost cost) {
147 cost_per_coeff += cost;
148 return *this;
149 }
150
151 // This is a resource requirement that should be returned from expressions
152 // that do not have any block evaluation preference (e.g. default tensor
153 // expression with raw buffer access).
154 EIGEN_DEVICE_FUNC
155 static EIGEN_STRONG_INLINE TensorBlockResourceRequirements any() {
156 return {TensorBlockShapeType::kUniformAllDims, 1, {0, 0, 0}};
157 }
158
159 private:
160 using Requirements = TensorBlockResourceRequirements;
161
162 EIGEN_DEVICE_FUNC
163 static EIGEN_STRONG_INLINE size_t merge(size_t lhs_size, size_t rhs_size) {
164 return numext::maxi(lhs_size, rhs_size);
165 }
166
167 EIGEN_DEVICE_FUNC
168 static EIGEN_STRONG_INLINE TensorBlockShapeType
169 merge(TensorBlockShapeType lhs, TensorBlockShapeType rhs) {
170 return (lhs == TensorBlockShapeType::kSkewedInnerDims ||
171 rhs == TensorBlockShapeType::kSkewedInnerDims)
172 ? TensorBlockShapeType::kSkewedInnerDims
173 : TensorBlockShapeType::kUniformAllDims;
174 }
175
176 EIGEN_DEVICE_FUNC
177 static EIGEN_STRONG_INLINE TensorOpCost merge(TensorOpCost lhs_cost,
178 TensorOpCost rhs_cost) {
179 return lhs_cost + rhs_cost;
180 }
181 };
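// Example (illustrative sketch, not exercised by this header): combining the
// block requirements of two hypothetical expression arguments.
//
//   TensorBlockResourceRequirements lhs =
//       TensorBlockResourceRequirements::skewed<float>(48 * 1024);
//   TensorBlockResourceRequirements rhs =
//       TensorBlockResourceRequirements::uniform<float>(32 * 1024);
//   // merge() picks the skewed shape, the larger size (in coefficients),
//   // and adds the per-coefficient costs.
//   TensorBlockResourceRequirements merged =
//       TensorBlockResourceRequirements::merge(lhs, rhs);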
182
183 // -------------------------------------------------------------------------- //
184 // TensorBlockDescriptor specifies a block offset within a tensor and the block
185 // sizes along each of the tensor dimensions.
186
187 template <int NumDims, typename IndexType = Eigen::Index>
188 class TensorBlockDescriptor {
189 public:
190 typedef DSizes<IndexType, NumDims> Dimensions;
191
192 // If we evaluate a Tensor assignment, and the expression on the left already
193 // has a memory buffer, then we can do a performance optimization and evaluate
194 // the root expression directly into the final output memory. Sometimes it's
195 // also possible to reuse that buffer for materializing subexpressions inside
196 // an expression tree, to avoid dynamic memory allocation.
197 //
198 // The pointer type of the underlying storage is erased, because passing the
199 // Scalar type through all the expression evaluation layers would require far
200 // too many templates. In practice the destination buffer type should always
201 // match the evaluated expression scalar type.
202 class DestinationBuffer {
203 public:
204 enum DestinationBufferKind : int {
205 // The above explicit specification of "int" as the enum basetype is
206 // needed to get around a HIPCC link error ("the field type is not
207 // amp-compatible")
208 // which is issued for class members with the enum type.
209 // TODO(rocm):
210 // remove the "int" basetype once HIPCC has been fixed to not error out
211 // in the above scenario.
212
213 // Destination buffer is not defined (`m_data` == nullptr).
214 kEmpty,
215
216 // The tensor block defined by the owning tensor block descriptor fits
217 // contiguously into the destination buffer. In this case it's safe to
218 // materialize the tensor block in the destination buffer, wrap it in a
219 // TensorMap, and use it to build an Eigen expression on top of it.
220 kContiguous,
221
222 // Destination buffer strides do not match strides of the contiguously
223 // stored block, and it's impossible to define a TensorMap over this
224 // buffer. However, if we are evaluating the root of an expression tree, we
225 // can still materialize an output into this destination, because we can
226 // guarantee that no one will ever access it through the block API.
227 //
228 // In theory it is possible to build a valid TensorStriding<TensorMap>
229 // expression on top of this destination buffer; however, it has
230 // inefficient coeff/packet access, and defeats the purpose of the fast block
231 // evaluation API.
232 kStrided
233 };
234
235 template <typename Scalar>
236 Scalar* data() const {
237 eigen_assert(m_data_type_size == sizeof(Scalar));
238 return static_cast<Scalar*>(m_data);
239 }
240
241 const Dimensions& strides() const { return m_strides; }
242 const DestinationBufferKind& kind() const { return m_kind; }
243
244 private:
245 friend class TensorBlockDescriptor;
246
247 DestinationBuffer() : m_data(NULL), m_data_type_size(0), m_kind(kEmpty) {}
248
249 template <typename Scalar>
250 DestinationBuffer(Scalar* data, const Dimensions& strides,
251 DestinationBufferKind kind)
252 : m_data(static_cast<void*>(data)),
253 m_data_type_size(sizeof(Scalar)),
254 m_strides(strides),
255 m_kind(kind) {}
256
257 template <int Layout, typename Scalar>
258 static DestinationBuffer make(const TensorBlockDescriptor& desc,
259 Scalar* data, const Dimensions& strides) {
260 return DestinationBuffer(data, strides, kind<Layout>(desc, strides));
261 }
262
263 template <int Layout>
264 static DestinationBufferKind kind(const TensorBlockDescriptor& desc,
265 const Dimensions& strides) {
266 const Dimensions& desc_dims = desc.dimensions();
267 const Dimensions& desc_strides = internal::strides<Layout>(desc_dims);
268 for (int i = 0; i < NumDims; ++i) {
269 if (desc_dims[i] == 1) continue;
270 if (desc_strides[i] != strides[i]) return kStrided;
271 }
272 return kContiguous;
273 }
274
275 // Storage pointer is type erased, to reduce template bloat, but we still
276 // keep the size of the underlying element type for error checking.
277 void* m_data;
278 size_t m_data_type_size;
279
280 // Destination buffer dimensions always match the dimensions of the tensor
281 // block descriptor it belongs to; however, the strides might be different.
282 Dimensions m_strides;
283
284 DestinationBufferKind m_kind;
285 };
286
287 TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions,
288 const DestinationBuffer& destination)
289 : m_offset(offset),
290 m_dimensions(dimensions),
291 m_destination(destination) {}
292
293 TensorBlockDescriptor(const IndexType offset, const Dimensions& dimensions)
294 : m_offset(offset),
295 m_dimensions(dimensions),
296 m_destination(DestinationBuffer()) {}
297
298 IndexType offset() const { return m_offset; }
299 const Dimensions& dimensions() const { return m_dimensions; }
300 IndexType dimension(int index) const { return m_dimensions[index]; }
301 IndexType size() const { return array_prod<IndexType>(m_dimensions); }
302
303 const DestinationBuffer& destination() const { return m_destination; }
304
305 template <int Layout, typename Scalar>
306 void AddDestinationBuffer(Scalar* dst_base, const Dimensions& dst_strides) {
307 eigen_assert(dst_base != NULL);
308 m_destination =
309 DestinationBuffer::template make<Layout>(*this, dst_base, dst_strides);
310 }
311
312 template <int Layout, typename Scalar, typename DstStridesIndexType>
313 void AddDestinationBuffer(
314 Scalar* dst_base,
315 const DSizes<DstStridesIndexType, NumDims>& dst_strides) {
316 // DSizes constructor will do index type promotion if it's safe.
317 AddDestinationBuffer<Layout>(dst_base, Dimensions(dst_strides));
318 }
319
320 TensorBlockDescriptor& DropDestinationBuffer() {
321 m_destination.m_data = NULL;
322 m_destination.m_kind = DestinationBuffer::kEmpty;
323 return *this;
324 }
325
326 bool HasDestinationBuffer() const {
327 return m_destination.kind() != DestinationBuffer::kEmpty;
328 }
329
330 // Returns a copy of `*this` with updated offset.
331 TensorBlockDescriptor WithOffset(IndexType offset) const {
332 return TensorBlockDescriptor(offset, m_dimensions, m_destination);
333 }
334
335 private:
336 // Offset and dimensions are immutable after construction. Block descriptor
337 // can only be mutated by adding or dropping destination.
338 const IndexType m_offset;
339 const Dimensions m_dimensions;
340 DestinationBuffer m_destination;
341 };
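// Example (illustrative sketch): a descriptor for the 2x10 block covering rows
// 4..5 of a column major 10x10 tensor, with a hypothetical output buffer
// `output_ptr` attached as a destination.
//
//   TensorBlockDescriptor<2> desc(/*offset=*/4,
//                                 DSizes<Eigen::Index, 2>(2, 10));
//   desc.AddDestinationBuffer<ColMajor>(
//       output_ptr + desc.offset(),
//       /*dst_strides=*/DSizes<Eigen::Index, 2>(1, 10));
//   // desc.destination().kind() == kStrided: the block rows are contiguous
//   // (stride 1), but consecutive columns are 10 elements apart, while a
//   // densely packed 2x10 block would have a column stride of 2.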
342
343 // -------------------------------------------------------------------------- //
344 // TensorBlockMapper is responsible for iterating over the blocks of a tensor.
345
346 template <int NumDims, int Layout, typename IndexType = Eigen::Index>
347 class TensorBlockMapper {
348 typedef TensorBlockDescriptor<NumDims, IndexType> BlockDescriptor;
349
350 public:
351 typedef DSizes<IndexType, NumDims> Dimensions;
352
353 TensorBlockMapper() = default;
354 TensorBlockMapper(const DSizes<IndexType, NumDims>& dimensions,
355 const TensorBlockResourceRequirements& requirements)
356 : m_tensor_dimensions(dimensions), m_requirements(requirements) {
357 // Compute block dimensions and the total number of blocks.
358 InitializeBlockDimensions();
359 }
360
361 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockCount() const {
362 return m_total_block_count;
363 }
364
365 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType blockTotalSize() const {
366 return m_block_dimensions.TotalSize();
367 }
368
369 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const DSizes<IndexType, NumDims>&
370 blockDimensions() const {
371 return m_block_dimensions;
372 }
373
374 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE BlockDescriptor
375 blockDescriptor(IndexType block_index) const {
376 static const bool isColMajor = Layout == static_cast<int>(ColMajor);
377
378 IndexType offset = 0;
379 DSizes<IndexType, NumDims> dimensions;
380
381 if (NumDims == 0) return BlockDescriptor(offset, dimensions);
382
383 // Iterate outer -> inner dimensions.
384 for (int i = NumDims - 1; i >= 0; --i) {
385 const int dim = isColMajor ? i : NumDims - i - 1;
386
387 const IndexType idx = block_index / m_block_strides[dim];
388 block_index -= idx * m_block_strides[dim];
389
390 const IndexType coord = idx * m_block_dimensions[dim];
391 dimensions[dim] = numext::mini(m_tensor_dimensions[dim] - coord,
392 m_block_dimensions[dim]);
393 offset += coord * m_tensor_strides[dim];
394 }
395
396 return {offset, dimensions};
397 }
398
399 private:
400 void InitializeBlockDimensions() {
401 // Requested block shape and size.
402 const TensorBlockShapeType shape_type = m_requirements.shape_type;
403 IndexType target_block_size =
404 numext::maxi<IndexType>(1, static_cast<IndexType>(m_requirements.size));
405
406 IndexType tensor_size = m_tensor_dimensions.TotalSize();
407
408 // Corner case: one of the dimensions is zero. The logic below is too complex
409 // to handle this case generically, so just use a unit block size.
410 // Note: we must not yield blocks with zero dimensions (recipe for
411 // overflows/underflows, divisions by zero and NaNs later).
412 if (tensor_size == 0) {
413 for (int i = 0; i < NumDims; ++i) {
414 m_block_dimensions[i] = 1;
415 }
416 m_total_block_count = 0;
417 return;
418 }
419
420 // If tensor fits into a target block size, evaluate it as a single block.
421 if (tensor_size <= target_block_size) {
422 m_block_dimensions = m_tensor_dimensions;
423 m_total_block_count = 1;
424 // The only valid block index is `0`, and in this case we do not need
425 // to compute real strides for tensor or blocks (see blockDescriptor).
426 for (int i = 0; i < NumDims; ++i) {
427 m_tensor_strides[i] = 0;
428 m_block_strides[i] = 1;
429 }
430 return;
431 }
432
433 static const bool isColMajor = Layout == static_cast<int>(ColMajor);
434
435 // Block shape skewed towards inner dimension.
436 if (shape_type == TensorBlockShapeType::kSkewedInnerDims) {
437 IndexType coeff_to_allocate = target_block_size;
438
439 for (int i = 0; i < NumDims; ++i) {
440 const int dim = isColMajor ? i : NumDims - i - 1;
441 m_block_dimensions[dim] =
442 numext::mini(coeff_to_allocate, m_tensor_dimensions[dim]);
443 coeff_to_allocate = divup(
444 coeff_to_allocate,
445 numext::maxi(static_cast<IndexType>(1), m_block_dimensions[dim]));
446 }
447 eigen_assert(coeff_to_allocate == 1);
448
449 } else if (shape_type == TensorBlockShapeType::kUniformAllDims) {
450 // The tensor will not fit within the 'target_block_size' budget: calculate
451 // tensor block dimension sizes based on a "square" dimension size target.
452 const IndexType dim_size_target = convert_index<IndexType>(
453 std::pow(static_cast<float>(target_block_size),
454 1.0f / static_cast<float>(m_block_dimensions.rank())));
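// e.g. (illustrative): for a rank-2 tensor with target_block_size = 1024,
// dim_size_target = sqrt(1024) = 32, i.e. roughly 32x32 blocks.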
455
456 for (int i = 0; i < NumDims; ++i) {
457 // TODO(andydavis) Adjust the inner most 'block_dim_size' to make it
458 // a multiple of the packet size. Note that reducing
459 // 'block_dim_size' in this manner can increase the number of
460 // blocks, and so will amplify any per-block overhead.
461 m_block_dimensions[i] =
462 numext::mini(dim_size_target, m_tensor_dimensions[i]);
463 }
464
465 // Add any un-allocated coefficients to inner dimension(s).
466 IndexType total_size = m_block_dimensions.TotalSize();
467 for (int i = 0; i < NumDims; ++i) {
468 const int dim = isColMajor ? i : NumDims - i - 1;
469
470 if (m_block_dimensions[dim] < m_tensor_dimensions[dim]) {
471 const IndexType total_size_other_dims =
472 total_size / m_block_dimensions[dim];
473 const IndexType alloc_avail =
474 divup<IndexType>(target_block_size, total_size_other_dims);
475 if (alloc_avail == m_block_dimensions[dim]) {
476 // Insufficient excess coefficients to allocate.
477 break;
478 }
479 m_block_dimensions[dim] =
480 numext::mini(m_tensor_dimensions[dim], alloc_avail);
481 total_size = total_size_other_dims * m_block_dimensions[dim];
482 }
483 }
484
485 } else {
486 eigen_assert(false); // unknown block shape
487 }
488
489 eigen_assert(m_block_dimensions.TotalSize() >=
490 numext::mini<IndexType>(target_block_size,
491 m_tensor_dimensions.TotalSize()));
492
493 // Calculate block counts by dimension and total block count.
494 DSizes<IndexType, NumDims> block_count;
495 for (int i = 0; i < NumDims; ++i) {
496 block_count[i] = divup(m_tensor_dimensions[i], m_block_dimensions[i]);
497 }
498 m_total_block_count = array_prod(block_count);
499
500 // Calculate block strides (used for enumerating blocks).
501 m_tensor_strides = strides<Layout>(m_tensor_dimensions);
502 m_block_strides = strides<Layout>(block_count);
503 }
504
505 DSizes<IndexType, NumDims> m_tensor_dimensions;
506 TensorBlockResourceRequirements m_requirements;
507
508 DSizes<IndexType, NumDims> m_block_dimensions;
509 IndexType m_total_block_count;
510
511 DSizes<IndexType, NumDims> m_tensor_strides;
512 DSizes<IndexType, NumDims> m_block_strides;
513 };
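// Example (illustrative sketch): splitting a 100x100 column major tensor into
// blocks of at most 100 float coefficients, skewed towards the inner dimension.
//
//   DSizes<Eigen::Index, 2> dims(100, 100);
//   TensorBlockMapper<2, ColMajor> mapper(
//       dims, TensorBlockResourceRequirements::skewed<float>(
//                 /*size_in_bytes=*/100 * sizeof(float)));
//   // mapper.blockDimensions() == [100, 1], mapper.blockCount() == 100.
//   for (Eigen::Index i = 0; i < mapper.blockCount(); ++i) {
//     TensorBlockDescriptor<2> desc = mapper.blockDescriptor(i);
//     // desc.offset() == i * 100, desc.dimensions() == [100, 1].
//   }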
514
515 // -------------------------------------------------------------------------- //
516 // TensorBlockScratchAllocator is responsible for allocating temporary buffers
517 // for block evaluation (output or input block materialization). Given that
518 // Eigen expression traversal order is deterministic, all temporary allocations
519 // happen in the same order, and usually have exactly the same size. The
520 // scratch allocator keeps track of all dynamic allocations, so after the
521 // first block evaluation is completed, we should be able to reuse all the
522 // temporary buffers for the next block evaluation.
523
524 template <typename Device>
525 class TensorBlockScratchAllocator {
526 public:
527 explicit TensorBlockScratchAllocator(const Device& device)
528 : m_device(device), m_allocation_index(0) {}
529
530 ~TensorBlockScratchAllocator() {
531 for (size_t i = 0; i < m_allocations.size(); ++i) {
532 m_device.deallocate(m_allocations[i].ptr);
533 }
534 }
535
536 void* allocate(size_t size) {
537 // TODO(ezhulenev): Remove when replaced with inlined vector.
538 if (m_allocations.capacity() == 0) m_allocations.reserve(8);
539
540 // Check if we already have an existing allocation at the current index.
541 const int num_allocations = static_cast<int>(m_allocations.size());
542 const bool has_allocation = m_allocation_index < num_allocations;
543
544 // Allocation index can't be larger than the number of allocations.
545 eigen_assert(m_allocation_index <= num_allocations);
546
547 // If we have an existing allocation, and its size is larger than or equal to
548 // the requested size, we do nothing.
549
550 // If current allocation can't fit requested size, we deallocate it, and
551 // replace with a larger allocation.
552 if (has_allocation && m_allocations[m_allocation_index].size < size) {
553 m_device.deallocate(m_allocations[m_allocation_index].ptr);
554 m_allocations[m_allocation_index].ptr = m_device.allocate(size);
555 m_allocations[m_allocation_index].size = size;
556 }
557
558 // Make a new allocation if we don't have an existing one.
559 if (!has_allocation) {
560 Allocation allocation;
561 allocation.ptr = m_device.allocate(size);
562 allocation.size = size;
563 m_allocations.push_back(allocation);
564 }
565
566 eigen_assert(m_allocations[m_allocation_index].ptr != NULL);
567 eigen_assert(m_allocations[m_allocation_index].size >= size);
568
569 return m_allocations[m_allocation_index++].ptr;
570 }
571
572 void reset() { m_allocation_index = 0; }
573
574 private:
575 struct Allocation {
576 void* ptr;
577 size_t size;
578 };
579
580 const Device& m_device;
581 int m_allocation_index;
582 // TODO(ezhulenev): This should be an inlined vector.
583 std::vector<Allocation> m_allocations;
584 };
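// Example (illustrative sketch, assuming a `DefaultDevice`): allocations made
// in the same order across block evaluations are reused after reset().
//
//   DefaultDevice device;
//   TensorBlockScratchAllocator<DefaultDevice> scratch(device);
//   void* a = scratch.allocate(1024);  // new allocation #0
//   void* b = scratch.allocate(512);   // new allocation #1
//   scratch.reset();                   // rewind to allocation #0
//   void* c = scratch.allocate(1024);  // reuses allocation #0 (c == a)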
585
586 // -------------------------------------------------------------------------- //
587 // TensorBlockKind represents all possible block kinds that can be produced by
588 // the TensorEvaluator::evalBlock function.
589 enum TensorBlockKind {
590 // Tensor block that is a lazy expression that must be assigned to a
591 // destination using TensorBlockAssign.
592 kExpr,
593
594 // Tensor block that is a view into a memory buffer owned by an underlying
595 // Tensor expression (e.g. it can be a view into a Tensor buffer).
596 kView,
597
598 // Tensor block that was materialized in a scratch memory buffer, allocated
599 // with TensorBlockScratchAllocator. This block must be copied to a
600 // destination, similar to a block of `kExpr` type.
601 kMaterializedInScratch,
602
603 // Tensor block that was materialized directly into the final output memory
604 // buffer. For example if the left side of an assignment is a Tensor, we can
605 // directly materialize the block in the destination memory.
606 //
607 // If strides in the output buffer do not match tensor block strides, the
608 // Tensor expression will be invalid, and should not be used by
609 // TensorBlockAssign or for constructing another block expression.
610 kMaterializedInOutput
611 };
612
613 // -------------------------------------------------------------------------- //
614 // TensorBlockNotImplemented should be used to define the TensorBlock typedef in
615 // TensorEvaluators that do not support block evaluation.
616
617 class TensorBlockNotImplemented {
618 public:
619 typedef void XprType;
620 };
621
622 // -------------------------------------------------------------------------- //
623 // XprScalar extracts the Scalar type from Eigen expressions (if the expression
624 // type is not void). It's required to be able to define a lazy block expression
625 // for argument types that do not support block evaluation.
626
627 template <typename XprType>
628 struct XprScalar {
629 typedef typename XprType::Scalar type;
630 };
631 template <>
632 struct XprScalar<void> {
633 typedef void type;
634 };
635
636 // -------------------------------------------------------------------------- //
637 // TensorMaterializedBlock is a fully evaluated block of the original tensor,
638 // and XprType is just a TensorMap over the data. This block type is typically
639 // used to materialize blocks of tensor expressions that can't be efficiently
640 // represented as lazy Tensor expressions with fast coeff/packet operations,
641 // e.g. we materialize all broadcasts into evaluated blocks.
642 //
643 // TensorMaterializedBlock does not own its memory buffer: it's either a memory
644 // buffer that backs the original expression (e.g. the block is just a view into
645 // a Tensor), or a memory buffer allocated with the scratch allocator, in which
646 // case the scratch allocator will deallocate it at the end of block based
647 // expression execution.
648 //
649 // If the block was evaluated directly into the output buffer, and strides in
650 // the output buffer do not match block strides, the TensorMap expression will
651 // be invalid, and should never be used in block assignment or any other tensor
652 // expression.
653
654 template <typename Scalar, int NumDims, int Layout,
655 typename IndexType = Eigen::Index>
656 class TensorMaterializedBlock {
657 public:
658 typedef DSizes<IndexType, NumDims> Dimensions;
659 typedef TensorMap<const Tensor<Scalar, NumDims, Layout> > XprType;
660
661 TensorMaterializedBlock(TensorBlockKind kind, const Scalar* data,
662 const Dimensions& dimensions, bool valid_expr = true)
663 : m_kind(kind),
664 m_data(data),
665 m_dimensions(dimensions),
666 m_expr(m_data, m_dimensions),
667 m_valid_expr(valid_expr) {
668 eigen_assert(m_kind == internal::TensorBlockKind::kView ||
669 m_kind == internal::TensorBlockKind::kMaterializedInScratch ||
670 m_kind == internal::TensorBlockKind::kMaterializedInOutput);
671 }
672
673 TensorBlockKind kind() const { return m_kind; }
674 // NOTE(ezhulenev): Returning XprType by value like in other block types
675 // causes asan failures. The theory is that XprType::Nested doesn't work
676 // properly for TensorMap.
677 const XprType& expr() const {
678 eigen_assert(m_valid_expr);
679 return m_expr;
680 }
681 const Scalar* data() const { return m_data; }
682 void cleanup() {}
683
684 typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
685
686 // TensorMaterializedBlock can be backed by different types of storage:
687 //
688 // (1) Contiguous block of memory allocated with scratch allocator.
689 // (2) Contiguous block of memory reused from tensor block descriptor
690 // destination buffer.
691 // (3) Strided block of memory reused from tensor block descriptor
692 // destination buffer.
693 //
694 class Storage {
695 public:
696 Scalar* data() const { return m_data; }
697 const Dimensions& dimensions() const { return m_dimensions; }
698 const Dimensions& strides() const { return m_strides; }
699
700 TensorMaterializedBlock AsTensorMaterializedBlock() const {
701 return TensorMaterializedBlock(
702 m_materialized_in_output
703 ? internal::TensorBlockKind::kMaterializedInOutput
704 : internal::TensorBlockKind::kMaterializedInScratch,
705 m_data, m_dimensions, !m_strided_storage);
706 }
707
708 private:
709 friend class TensorMaterializedBlock;
710
711 Storage(Scalar* data, const Dimensions& dimensions,
712 const Dimensions& strides, bool materialized_in_output,
713 bool strided_storage)
714 : m_data(data),
715 m_dimensions(dimensions),
716 m_strides(strides),
717 m_materialized_in_output(materialized_in_output),
718 m_strided_storage(strided_storage) {}
719
720 Scalar* m_data;
721 Dimensions m_dimensions;
722 Dimensions m_strides;
723 bool m_materialized_in_output;
724 bool m_strided_storage;
725 };
726
727 // Creates storage for a materialized block, either reusing the block descriptor
728 // destination buffer or allocating a new buffer with the scratch allocator.
729 template <typename TensorBlockScratch>
730 EIGEN_STRONG_INLINE static Storage prepareStorage(
731 TensorBlockDesc& desc, TensorBlockScratch& scratch,
732 bool allow_strided_storage = false) {
733 // Try to reuse destination as an output block buffer.
734 typedef typename TensorBlockDesc::DestinationBuffer DestinationBuffer;
735
736 if (desc.destination().kind() == DestinationBuffer::kContiguous) {
737 Scalar* buffer = desc.destination().template data<Scalar>();
738 desc.DropDestinationBuffer();
739 return Storage(buffer, desc.dimensions(),
740 internal::strides<Layout>(desc.dimensions()),
741 /*materialized_in_output=*/true,
742 /*strided_storage=*/false);
743
744 } else if (desc.destination().kind() == DestinationBuffer::kStrided &&
745 allow_strided_storage) {
746 Scalar* buffer = desc.destination().template data<Scalar>();
747 desc.DropDestinationBuffer();
748 return Storage(buffer, desc.dimensions(), desc.destination().strides(),
749 /*materialized_in_output=*/true, /*strided_storage=*/true);
750
751 } else {
752 void* mem = scratch.allocate(desc.size() * sizeof(Scalar));
753 return Storage(static_cast<Scalar*>(mem), desc.dimensions(),
754 internal::strides<Layout>(desc.dimensions()),
755 /*materialized_in_output=*/false,
756 /*strided_storage=*/false);
757 }
758 }
759
760 // Creates a materialized block for the given descriptor from a memory buffer.
761 template <typename DataDimensions, typename TensorBlockScratch>
762 EIGEN_STRONG_INLINE static TensorMaterializedBlock materialize(
763 const Scalar* data, const DataDimensions& data_dims,
764 TensorBlockDesc& desc, TensorBlockScratch& scratch) {
765 eigen_assert(array_size<DataDimensions>::value == desc.dimensions().size());
766
767 // If the tensor block dimensions cover a contiguous block of the underlying
768 // memory, we can skip block buffer memory allocation, and construct a block
769 // from the existing `data` memory buffer.
770 //
771 // Example: (RowMajor layout)
772 // data_dims: [11, 12, 13, 14]
773 // desc.dimensions(): [1, 1, 3, 14]
774 //
775 // In this case we can construct a TensorBlock starting at
776 // `data + desc.offset()`, with `desc.dimensions()` block sizes.
777 static const bool is_col_major = Layout == ColMajor;
778
779 // Find out how many inner dimensions have a matching size.
780 int num_matching_inner_dims = 0;
781 for (int i = 0; i < NumDims; ++i) {
782 int dim = is_col_major ? i : NumDims - i - 1;
783 if (data_dims[dim] != desc.dimensions()[dim]) break;
784 ++num_matching_inner_dims;
785 }
786
787 // All the outer dimensions must be of size `1`, except a single dimension
788 // before the matching inner dimension (`3` in the example above).
789 bool can_use_direct_access = true;
790 for (int i = num_matching_inner_dims + 1; i < NumDims; ++i) {
791 int dim = is_col_major ? i : NumDims - i - 1;
792 if (desc.dimension(dim) != 1) {
793 can_use_direct_access = false;
794 break;
795 }
796 }
797
798 if (can_use_direct_access) {
799 const Scalar* block_start = data + desc.offset();
800 return TensorMaterializedBlock(internal::TensorBlockKind::kView,
801 block_start, desc.dimensions());
802
803 } else {
804 // Reuse destination buffer or allocate new buffer with scratch allocator.
805 const Storage storage = prepareStorage(desc, scratch);
806
807 typedef internal::TensorBlockIO<Scalar, IndexType, NumDims, Layout>
808 TensorBlockIO;
809 typedef typename TensorBlockIO::Dst TensorBlockIODst;
810 typedef typename TensorBlockIO::Src TensorBlockIOSrc;
811
812 TensorBlockIOSrc src(internal::strides<Layout>(Dimensions(data_dims)),
813 data, desc.offset());
814 TensorBlockIODst dst(storage.dimensions(), storage.strides(),
815 storage.data());
816
817 TensorBlockIO::Copy(dst, src);
818 return storage.AsTensorMaterializedBlock();
819 }
820 }
821
822 private:
823 TensorBlockKind m_kind;
824 const Scalar* m_data;
825 Dimensions m_dimensions;
826 XprType m_expr;
827 bool m_valid_expr;
828 };
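// Example (illustrative sketch; `tensor_data` and `scratch` are assumed to be a
// valid float buffer and a TensorBlockScratchAllocator): materializing a block
// of a tensor backed by plain memory. If the requested block maps onto
// contiguous memory the result is a `kView` into `data`, otherwise the block is
// copied into the descriptor destination or into scratch memory.
//
//   typedef TensorMaterializedBlock<float, 2, ColMajor> TensorBlock;
//   TensorBlockDescriptor<2> desc(/*offset=*/0,
//                                 DSizes<Eigen::Index, 2>(10, 2));
//   TensorBlock block = TensorBlock::materialize(
//       /*data=*/tensor_data, /*data_dims=*/DSizes<Eigen::Index, 2>(10, 10),
//       desc, scratch);
//   // Here desc covers the first two columns of the 10x10 column major
//   // buffer, which are contiguous, so block.kind() == kView.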
829
830 // -------------------------------------------------------------------------- //
831 // TensorCwiseUnaryBlock is a lazy tensor expression block that applies a
832 // UnaryOp functor to the blocks produced by the underlying Tensor expression.
833
834 template <typename UnaryOp, typename ArgTensorBlock>
835 class TensorCwiseUnaryBlock {
836 static const bool NoArgBlockAccess =
837 internal::is_void<typename ArgTensorBlock::XprType>::value;
838
839 public:
840 typedef typename conditional<
841 NoArgBlockAccess, void,
842 TensorCwiseUnaryOp<UnaryOp, const typename ArgTensorBlock::XprType> >::
843 type XprType;
844
845 typedef typename XprScalar<XprType>::type Scalar;
846
847 TensorCwiseUnaryBlock(const ArgTensorBlock& arg_block, const UnaryOp& functor)
848 : m_arg_block(arg_block), m_functor(functor) {}
849
850 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
851
852 XprType expr() const { return XprType(m_arg_block.expr(), m_functor); }
853 const Scalar* data() const { return NULL; }
854 void cleanup() { m_arg_block.cleanup(); }
855
856 private:
857 ArgTensorBlock m_arg_block;
858 UnaryOp m_functor;
859 };
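// Example (illustrative sketch): wrapping a materialized argument block
// `arg_block` (in practice produced by the argument evaluator's `block()`
// method) into a lazy unary expression block.
//
//   typedef TensorMaterializedBlock<float, 2, ColMajor> ArgBlock;
//   typedef TensorCwiseUnaryBlock<scalar_sqrt_op<float>, ArgBlock> SqrtBlock;
//   SqrtBlock block(arg_block, scalar_sqrt_op<float>());
//   // block.expr() is a TensorCwiseUnaryOp over the argument's TensorMap and
//   // can be assigned to a destination with TensorBlockAssignment.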
860
861 // -------------------------------------------------------------------------- //
862 // TensorCwiseBinaryBlock is a lazy tensor expression block that applies a
863 // BinaryOp functor to the blocks produced by the underlying Tensor expressions.
864
865 template <typename BinaryOp, typename LhsTensorBlock, typename RhsTensorBlock>
866 class TensorCwiseBinaryBlock {
867 static const bool NoArgBlockAccess =
868 internal::is_void<typename LhsTensorBlock::XprType>::value ||
869 internal::is_void<typename RhsTensorBlock::XprType>::value;
870
871 public:
872 typedef typename conditional<
873 NoArgBlockAccess, void,
874 TensorCwiseBinaryOp<BinaryOp, const typename LhsTensorBlock::XprType,
875 const typename RhsTensorBlock::XprType> >::type
876 XprType;
877
878 typedef typename XprScalar<XprType>::type Scalar;
879
880 TensorCwiseBinaryBlock(const LhsTensorBlock& left_block,
881 const RhsTensorBlock& right_block,
882 const BinaryOp& functor)
883 : m_left_block(left_block),
884 m_right_block(right_block),
885 m_functor(functor) {}
886
887 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
888
889 XprType expr() const {
890 return XprType(m_left_block.expr(), m_right_block.expr(), m_functor);
891 }
892
893 const Scalar* data() const { return NULL; }
894
895 void cleanup() {
896 m_left_block.cleanup();
897 m_right_block.cleanup();
898 }
899
900 private:
901 LhsTensorBlock m_left_block;
902 RhsTensorBlock m_right_block;
903 BinaryOp m_functor;
904 };
905
906 // -------------------------------------------------------------------------- //
907 // TensorUnaryExprBlock is a lazy tensor expression block that can construct
908 // an arbitrary tensor expression from a block of the underlying type (this is a
909 // generalization of the TensorCwiseUnaryBlock for arbitrary expressions).
910
911 template <typename BlockFactory, typename ArgTensorBlock>
912 class TensorUnaryExprBlock {
913 typedef typename ArgTensorBlock::XprType ArgXprType;
914 static const bool NoArgBlockAccess = internal::is_void<ArgXprType>::value;
915
916 public:
917 typedef typename conditional<
918 NoArgBlockAccess, void,
919 typename BlockFactory::template XprType<ArgXprType>::type>::type XprType;
920
921 typedef typename XprScalar<XprType>::type Scalar;
922
923 TensorUnaryExprBlock(const ArgTensorBlock& arg_block,
924 const BlockFactory& factory)
925 : m_arg_block(arg_block), m_factory(factory) {}
926
927 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
928 XprType expr() const { return m_factory.expr(m_arg_block.expr()); }
929 const Scalar* data() const { return NULL; }
930 void cleanup() { m_arg_block.cleanup(); }
931
932 private:
933 ArgTensorBlock m_arg_block;
934 BlockFactory m_factory;
935 };
936
937 // -------------------------------------------------------------------------- //
938 // TensorTernaryExprBlock is a lazy tensor expression block that can construct
939 // an arbitrary tensor expression from three blocks of the underlying type.
940
941 template <typename BlockFactory, typename Arg1TensorBlock,
942 typename Arg2TensorBlock, typename Arg3TensorBlock>
943 class TensorTernaryExprBlock {
944 typedef typename Arg1TensorBlock::XprType Arg1XprType;
945 typedef typename Arg2TensorBlock::XprType Arg2XprType;
946 typedef typename Arg3TensorBlock::XprType Arg3XprType;
947
948 static const bool NoArgBlockAccess = internal::is_void<Arg1XprType>::value ||
949 internal::is_void<Arg2XprType>::value ||
950 internal::is_void<Arg3XprType>::value;
951
952 public:
953 typedef typename conditional<
954 NoArgBlockAccess, void,
955 typename BlockFactory::template XprType<Arg1XprType, Arg2XprType,
956 Arg3XprType>::type>::type XprType;
957
958 typedef typename XprScalar<XprType>::type Scalar;
959
960 TensorTernaryExprBlock(const Arg1TensorBlock& arg1_block,
961 const Arg2TensorBlock& arg2_block,
962 const Arg3TensorBlock& arg3_block,
963 const BlockFactory& factory)
964 : m_arg1_block(arg1_block),
965 m_arg2_block(arg2_block),
966 m_arg3_block(arg3_block),
967 m_factory(factory) {}
968
969 TensorBlockKind kind() const { return internal::TensorBlockKind::kExpr; }
970 XprType expr() const {
971 return m_factory.expr(m_arg1_block.expr(), m_arg2_block.expr(),
972 m_arg3_block.expr());
973 }
974 const Scalar* data() const { return NULL; }
975 void cleanup() {
976 m_arg1_block.cleanup();
977 m_arg2_block.cleanup();
978 m_arg3_block.cleanup();
979 }
980
981 private:
982 Arg1TensorBlock m_arg1_block;
983 Arg2TensorBlock m_arg2_block;
984 Arg3TensorBlock m_arg3_block;
985 BlockFactory m_factory;
986 };
987
988 // -------------------------------------------------------------------------- //
989 // StridedLinearBufferCopy provides a method to copy data between two linear
990 // buffers with different strides, with optimized paths for scatter/gather.
991
992 template <typename Scalar, typename IndexType>
993 class StridedLinearBufferCopy {
994 typedef typename packet_traits<Scalar>::type Packet;
995 enum {
996 Vectorizable = packet_traits<Scalar>::Vectorizable,
997 PacketSize = packet_traits<Scalar>::size
998 };
999
1000 public:
1001 // Specifying linear copy kind statically gives ~30% speedup for small sizes.
1002 enum class Kind {
1003 Linear = 0, // src_stride == 1 && dst_stride == 1
1004 Scatter = 1, // src_stride == 1 && dst_stride != 1
1005 FillLinear = 2, // src_stride == 0 && dst_stride == 1
1006 FillScatter = 3, // src_stride == 0 && dst_stride != 1
1007 Gather = 4, // dst_stride == 1
1008 Random = 5 // everything else
1009 };
1010
1011 struct Dst {
1012 Dst(IndexType o, IndexType s, Scalar* d) : offset(o), stride(s), data(d) {}
1013
1014 IndexType offset;
1015 IndexType stride;
1016 Scalar* data;
1017 };
1018
1019 struct Src {
1020 Src(IndexType o, IndexType s, const Scalar* d)
1021 : offset(o), stride(s), data(d) {}
1022
1023 IndexType offset;
1024 IndexType stride;
1025 const Scalar* data;
1026 };
1027
1028 template <typename StridedLinearBufferCopy::Kind kind>
1029 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(const Dst& dst,
1030 const Src& src,
1031 const size_t count) {
1032 Run<kind>(count, dst.offset, dst.stride, dst.data, src.offset, src.stride,
1033 src.data);
1034 }
1035
1036 private:
1037 template <typename StridedLinearBufferCopy::Kind kind>
1038 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
1039 const IndexType count, const IndexType dst_offset,
1040 const IndexType dst_stride, Scalar* EIGEN_RESTRICT dst_data,
1041 const IndexType src_offset, const IndexType src_stride,
1042 const Scalar* EIGEN_RESTRICT src_data) {
1043 const Scalar* src = &src_data[src_offset];
1044 Scalar* dst = &dst_data[dst_offset];
1045
1046 if (!Vectorizable) {
1047 for (Index i = 0; i < count; ++i) {
1048 dst[i * dst_stride] = src[i * src_stride];
1049 }
1050 return;
1051 }
1052
1053 const IndexType vectorized_size = count - PacketSize;
1054 IndexType i = 0;
1055
1056 if (kind == StridedLinearBufferCopy::Kind::Linear) {
1057 // ******************************************************************** //
1058 // Linear copy from `src` to `dst`.
1059 const IndexType unrolled_size = count - 4 * PacketSize;
1060 eigen_assert(src_stride == 1 && dst_stride == 1);
1061 for (; i <= unrolled_size; i += 4 * PacketSize) {
1062 for (int j = 0; j < 4; ++j) {
1063 Packet p = ploadu<Packet>(src + i + j * PacketSize);
1064 pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1065 }
1066 }
1067 for (; i <= vectorized_size; i += PacketSize) {
1068 Packet p = ploadu<Packet>(src + i);
1069 pstoreu<Scalar, Packet>(dst + i, p);
1070 }
1071 for (; i < count; ++i) {
1072 dst[i] = src[i];
1073 }
1074 // ******************************************************************** //
1075 } else if (kind == StridedLinearBufferCopy::Kind::Scatter) {
1076 // Scatter from `src` to `dst`.
1077 eigen_assert(src_stride == 1 && dst_stride != 1);
1078 for (; i <= vectorized_size; i += PacketSize) {
1079 Packet p = ploadu<Packet>(src + i);
1080 pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1081 }
1082 for (; i < count; ++i) {
1083 dst[i * dst_stride] = src[i];
1084 }
1085 // ******************************************************************** //
1086 } else if (kind == StridedLinearBufferCopy::Kind::FillLinear) {
1087 // Fill `dst` with value at `*src`.
1088 eigen_assert(src_stride == 0 && dst_stride == 1);
1089 const IndexType unrolled_size = count - 4 * PacketSize;
1090 Packet p = pload1<Packet>(src);
1091 for (; i <= unrolled_size; i += 4 * PacketSize) {
1092 for (int j = 0; j < 4; ++j) {
1093 pstoreu<Scalar, Packet>(dst + i + j * PacketSize, p);
1094 }
1095 }
1096 for (; i <= vectorized_size; i += PacketSize) {
1097 pstoreu<Scalar, Packet>(dst + i, p);
1098 }
1099 for (; i < count; ++i) {
1100 dst[i] = *src;
1101 }
1102 // ******************************************************************** //
1103 } else if (kind == StridedLinearBufferCopy::Kind::FillScatter) {
1104 // Scatter `*src` into `dst`.
1105 eigen_assert(src_stride == 0 && dst_stride != 1);
1106 Packet p = pload1<Packet>(src);
1107 for (; i <= vectorized_size; i += PacketSize) {
1108 pscatter<Scalar, Packet>(dst + i * dst_stride, p, dst_stride);
1109 }
1110 for (; i < count; ++i) {
1111 dst[i * dst_stride] = *src;
1112 }
1113 // ******************************************************************** //
1114 } else if (kind == StridedLinearBufferCopy::Kind::Gather) {
1115 // Gather from `src` into `dst`.
1116 eigen_assert(dst_stride == 1);
1117 for (; i <= vectorized_size; i += PacketSize) {
1118 Packet p = pgather<Scalar, Packet>(src + i * src_stride, src_stride);
1119 pstoreu<Scalar, Packet>(dst + i, p);
1120 }
1121 for (; i < count; ++i) {
1122 dst[i] = src[i * src_stride];
1123 }
1124 // ******************************************************************** //
1125 } else if (kind == StridedLinearBufferCopy::Kind::Random) {
1126 // Random.
1127 for (; i < count; ++i) {
1128 dst[i * dst_stride] = src[i * src_stride];
1129 }
1130 } else {
1131 eigen_assert(false);
1132 }
1133 }
1134 };
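// Example (illustrative sketch): scattering 100 contiguous floats from a source
// buffer into every third element of a destination buffer (`src_ptr` and
// `dst_ptr` are assumed to reference valid storage of sufficient size).
//
//   typedef StridedLinearBufferCopy<float, Eigen::Index> Copier;
//   Copier::Run<Copier::Kind::Scatter>(
//       Copier::Dst(/*offset=*/0, /*stride=*/3, dst_ptr),
//       Copier::Src(/*offset=*/0, /*stride=*/1, src_ptr),
//       /*count=*/100);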
1135
1136 // -------------------------------------------------------------------------- //
1137 // TensorBlockIO copies data from `src` tensor block, to the `dst` tensor block.
1138 // It's possible to specify src->dst dimension mapping for the copy operation.
1139 // Dimensions of `dst` specify how many elements have to be copied; for the
1140 // `src` we only need to know the strides to navigate through the source memory buffer.
1141
1142 template <typename Scalar, typename IndexType, int NumDims, int Layout>
1143 class TensorBlockIO {
1144 static const bool IsColMajor = (Layout == ColMajor);
1145
1146 typedef StridedLinearBufferCopy<Scalar, IndexType> LinCopy;
1147
1148 public:
1149 typedef DSizes<IndexType, NumDims> Dimensions;
1150 typedef DSizes<int, NumDims> DimensionsMap;
1151
1152 struct Dst {
1153 Dst(const Dimensions& dst_dims, const Dimensions& dst_strides, Scalar* dst,
1154 IndexType dst_offset = 0)
1155 : dims(dst_dims), strides(dst_strides), data(dst), offset(dst_offset) {}
1156
1157 Dimensions dims;
1158 Dimensions strides;
1159 Scalar* data;
1160 IndexType offset;
1161 };
1162
1163 struct Src {
1164 Src(const Dimensions& src_strides, const Scalar* src,
1165 IndexType src_offset = 0)
1166 : strides(src_strides), data(src), offset(src_offset) {}
1167
1168 Dimensions strides;
1169 const Scalar* data;
1170 IndexType offset;
1171 };
1172
1173 // Copies data to `dst` from `src`, using provided dimensions mapping:
1174 //
1175 // src_dimension_index = dst_to_src_dim_map[dst_dimension_index]
1176 //
1177 // Returns the number of copied elements.
1178 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE IndexType Copy(
1179 const Dst& dst, const Src& src, const DimensionsMap& dst_to_src_dim_map) {
1180 // Copy single scalar value from `src` to `dst`.
1181 if (NumDims == 0) {
1182 *(dst.data + dst.offset) = *(src.data + src.offset);
1183 return 1;
1184 }
1185
1186 // Both `dst` and `src` must have a contiguous innermost dimension. We also
1187 // accept the special case with stride '0', because it's used as a trick to
1188 // implement broadcasting.
1189 {
1190 int inner_dim = IsColMajor ? 0 : NumDims - 1;
1191 EIGEN_UNUSED_VARIABLE(inner_dim);
1192 eigen_assert(dst.strides[inner_dim] == 1 || dst.strides[inner_dim] == 0);
1193 eigen_assert(src.strides[inner_dim] == 1 || src.strides[inner_dim] == 0);
1194 }
1195
1196 // Give a shorter name to `dst_to_src_dim_map`.
1197 const DimensionsMap& dim_map = dst_to_src_dim_map;
1198
1199 // Do not squeeze reordered inner dimensions.
1200 int num_squeezable_dims = NumSqueezableInnerDims(dim_map);
1201
1202 // NOTE: We find the innermost dimension (contiguous in memory) in the dst
1203 // block, and we write data linearly into that dimension, reading it from
1204 // the src. If dimensions are reordered, we might end up reading data from
1205 // the src with `stride != 1`.
1206 //
1207 // NOTE: Random-Read/Linear-Write can be up to ~2X faster than
1208 // Linear-Read/Random-Write: https://stackoverflow.com/a/54935680
1209
1210 // Find the innermost dimension in the dst whose size is not 1. This is the
1211 // effective inner dim.
1212 int num_size_one_inner_dims = 0;
1213 for (int i = 0; i < num_squeezable_dims; ++i) {
1214 const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1215 if (dst.dims[dst_dim] != 1) break;
1216 num_size_one_inner_dims++;
1217 }
1218
1219 // If all dimensions are of size 1, just copy a scalar from `src` to `dst`.
1220 if (num_size_one_inner_dims == NumDims) {
1221 *(dst.data + dst.offset) = *(src.data + src.offset);
1222 return 1;
1223 }
1224
1225 // Outermost dimension in the dst with `stride == 1` (contiguous in memory).
1226 const int dst_stride1_dim = IsColMajor
1227 ? num_size_one_inner_dims
1228 : NumDims - num_size_one_inner_dims - 1;
1229
1230 // Dimension in the src that corresponds to the dst innermost dimension.
1231 const int src_dim_for_dst_stride1_dim =
1232 NumDims == 0 ? 1 : dim_map[dst_stride1_dim];
1233
1234 // Size of the innermost dimension (length of contiguous blocks of memory).
1235 IndexType dst_inner_dim_size = NumDims == 0 ? 1 : dst.dims[dst_stride1_dim];
1236
1237 // Squeeze multiple inner dims into one if they are contiguous in `dst` and
1238 // `src` memory, so we can do fewer linear copy calls.
1239 for (int i = num_size_one_inner_dims + 1; i < num_squeezable_dims; ++i) {
1240 const int dst_dim = IsColMajor ? i : NumDims - i - 1;
1241 const IndexType dst_stride = dst.strides[dst_dim];
1242 const IndexType src_stride = src.strides[dim_map[dst_dim]];
1243 if (dst_inner_dim_size == dst_stride && dst_stride == src_stride) {
1244 dst_inner_dim_size *= dst.dims[dst_dim];
1245 ++num_size_one_inner_dims;
1246 } else {
1247 break;
1248 }
1249 }
1250
1251 // Setup strides to read data from `src` and write to `dst`.
1252 IndexType input_offset = src.offset;
1253 IndexType output_offset = dst.offset;
1254 IndexType input_stride =
1255 NumDims == 0 ? 1 : src.strides[src_dim_for_dst_stride1_dim];
1256 IndexType output_stride = NumDims == 0 ? 1 : dst.strides[dst_stride1_dim];
1257
1258 const int at_least_1_dim = NumDims <= 1 ? 1 : NumDims - 1;
1259 array<BlockIteratorState, at_least_1_dim> it;
1260
1261 // Initialize block iterator state. Squeeze away any dimension of size 1.
1262 int idx = 0; // currently initialized iterator state index
1263 for (int i = num_size_one_inner_dims; i < NumDims - 1; ++i) {
1264 const int dst_dim = IsColMajor ? i + 1 : NumDims - i - 2;
1265 if (dst.dims[dst_dim] == 1) continue;
1266
1267 it[idx].size = dst.dims[dst_dim];
1268 it[idx].input_stride = src.strides[dim_map[dst_dim]];
1269 it[idx].output_stride = dst.strides[dst_dim];
1270
1271 it[idx].input_span = it[idx].input_stride * (it[idx].size - 1);
1272 it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1273
1274 idx++;
1275 }
1276
1277 // Iterate copying data from src to dst.
1278 const IndexType block_total_size = NumDims == 0 ? 1 : dst.dims.TotalSize();
1279
1280 #define COPY_INNER_DIM(KIND) \
1281 IndexType num_copied = 0; \
1282 for (num_copied = 0; num_copied < block_total_size; \
1283 num_copied += dst_inner_dim_size) { \
1284 LinCopy::template Run<KIND>( \
1285 typename LinCopy::Dst(output_offset, output_stride, dst.data), \
1286 typename LinCopy::Src(input_offset, input_stride, src.data), \
1287 dst_inner_dim_size); \
1288 \
1289 for (int j = 0; j < idx; ++j) { \
1290 if (++it[j].count < it[j].size) { \
1291 input_offset += it[j].input_stride; \
1292 output_offset += it[j].output_stride; \
1293 break; \
1294 } \
1295 it[j].count = 0; \
1296 input_offset -= it[j].input_span; \
1297 output_offset -= it[j].output_span; \
1298 } \
1299 } \
1300 return num_copied;
1301
1302 if (input_stride == 1 && output_stride == 1) {
1303 COPY_INNER_DIM(LinCopy::Kind::Linear);
1304 } else if (input_stride == 1 && output_stride != 1) {
1305 COPY_INNER_DIM(LinCopy::Kind::Scatter);
1306 } else if (input_stride == 0 && output_stride == 1) {
1307 COPY_INNER_DIM(LinCopy::Kind::FillLinear);
1308 } else if (input_stride == 0 && output_stride != 1) {
1309 COPY_INNER_DIM(LinCopy::Kind::FillScatter);
1310 } else if (output_stride == 1) {
1311 COPY_INNER_DIM(LinCopy::Kind::Gather);
1312 } else {
1313 COPY_INNER_DIM(LinCopy::Kind::Random);
1314 }
1315
1316 #undef COPY_INNER_DIM
1317 }
1318
1319 // Copy from `src` to `dst` with an identity src->dst dimension map. Returns
1320 // the number of copied elements.
1321 static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexType Copy(const Dst& dst,
1322 const Src& src) {
1323 DimensionsMap dst_to_src_map;
1324 for (int i = 0; i < NumDims; ++i) dst_to_src_map[i] = i;
1325 return Copy(dst, src, dst_to_src_map);
1326 }
1327
1328 private:
1329 struct BlockIteratorState {
1330 BlockIteratorState()
1331 : size(0),
1332 count(0),
1333 input_stride(0),
1334 output_stride(0),
1335 input_span(0),
1336 output_span(0) {}
1337
1338 IndexType size;
1339 IndexType count;
1340 IndexType input_stride;
1341 IndexType output_stride;
1342 IndexType input_span;
1343 IndexType output_span;
1344 };
1345
1346 // Compute how many inner dimensions it's allowed to squeeze when doing IO
1347 // between two tensor blocks. It's safe to squeeze inner dimensions only
1348 // if they are not reordered.
1349 static int NumSqueezableInnerDims(const DimensionsMap& dim_map) {
1350 int num_squeezable_dims = 0;
1351 for (int i = 0; i < NumDims; ++i) {
1352 const int dim = IsColMajor ? i : NumDims - i - 1;
1353 if (dim_map[dim] != dim) break;
1354 num_squeezable_dims++;
1355 }
1356 return num_squeezable_dims;
1357 }
1358 };
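// Example (illustrative sketch): copying a [2, 3] block between two column
// major buffers while swapping the source dimensions (i.e. reading a transposed
// 3x2 region). `dst_ptr` and `src_ptr` are assumed to reference valid storage.
//
//   typedef TensorBlockIO<float, Eigen::Index, 2, ColMajor> IO;
//   DSizes<Eigen::Index, 2> block_dims(2, 3);
//   IO::Dst dst(block_dims, /*dst_strides=*/strides<ColMajor>(block_dims),
//               dst_ptr);
//   IO::Src src(/*src_strides=*/DSizes<Eigen::Index, 2>(1, 3), src_ptr);
//   IO::DimensionsMap dst_to_src_map;
//   dst_to_src_map[0] = 1;  // dst dim 0 reads from src dim 1
//   dst_to_src_map[1] = 0;  // dst dim 1 reads from src dim 0
//   IO::Copy(dst, src, dst_to_src_map);  // dst(i, j) = src(j, i)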
1359
1360 // -------------------------------------------------------------------------- //
1361 // TensorBlockAssignment assigns a block expression of type `TensorBlockExpr` to
1362 // a Tensor block defined by `desc`, backed by a memory buffer at `target`.
1363 //
1364 // Currently there is no way to write from a Tensor expression to a block of
1365 // memory if dimensions are reordered. If you need to do that, you should
1366 // materialize a Tensor block expression into a memory buffer, and then use
1367 // TensorBlockIO to copy data between two memory buffers with a custom
1368 // `target->src` dimension map (see definition above).
1369 //
1370 // Also, currently the innermost dimension of `target` must have a stride of '1'
1371 // (contiguous in memory). This restriction could be lifted with a `pscatter`,
1372 // but in practice it's never needed, and there is a similar TensorBlockIO
1373 // workaround for that.
1374 //
1375 // TODO(ezhulenev): TensorBlockAssignment is a special case of TensorBlockIO
1376 // where `src` is a tensor expression. Explore if it is possible to rewrite IO
1377 // to use expressions instead of pointers, and after that TensorBlockAssignment
1378 // will become an alias to IO.
1379 template <typename Scalar, int NumDims, typename TensorBlockExpr,
1380 typename IndexType = Eigen::Index>
1381 class TensorBlockAssignment {
1382 // We will use coeff/packet path to evaluate block expressions.
1383 typedef TensorEvaluator<const TensorBlockExpr, DefaultDevice>
1384 TensorBlockEvaluator;
1385
1386 typedef DSizes<IndexType, NumDims> Dimensions;
1387
1388 enum {
1389 Vectorizable = packet_traits<Scalar>::Vectorizable,
1390 PacketSize = packet_traits<Scalar>::size
1391 };
1392
1393 template <bool Vectorizable, typename Evaluator>
1394 struct InnerDimAssign {
1395 EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
1396 const Evaluator& eval,
1397 IndexType eval_offset) {
1398 for (IndexType i = 0; i < count; ++i) {
1399 target[i] = eval.coeff(eval_offset + i);
1400 }
1401 }
1402 };
1403
1404 template <typename Evaluator>
1405 struct InnerDimAssign<true, Evaluator> {
1406 EIGEN_ALWAYS_INLINE static void Run(Scalar* target, IndexType count,
1407 const Evaluator& eval,
1408 IndexType eval_offset) {
1409 typedef typename packet_traits<Scalar>::type Packet;
1410
1411 const IndexType unrolled_size = count - 4 * PacketSize;
1412 const IndexType vectorized_size = count - PacketSize;
1413 IndexType i = 0;
1414
1415 for (; i <= unrolled_size; i += 4 * PacketSize) {
1416 for (int j = 0; j < 4; ++j) {
1417 const IndexType idx = eval_offset + i + j * PacketSize;
1418 Packet p = eval.template packet<Unaligned>(idx);
1419 pstoreu<Scalar>(target + i + j * PacketSize, p);
1420 }
1421 }
1422
1423 for (; i <= vectorized_size; i += PacketSize) {
1424 Packet p = eval.template packet<Unaligned>(eval_offset + i);
1425 pstoreu<Scalar>(target + i, p);
1426 }
1427
1428 for (; i < count; ++i) {
1429 target[i] = eval.coeff(eval_offset + i);
1430 }
1431 }
1432 };
1433
1434 public:
1435 struct Target {
1436 Target(const Dimensions& target_dims, const Dimensions& target_strides,
1437 Scalar* target_data, IndexType target_offset = 0)
1438 : dims(target_dims),
1439 strides(target_strides),
1440 data(target_data),
1441 offset(target_offset) {}
1442
1443 Dimensions dims;
1444 Dimensions strides;
1445 Scalar* data;
1446 IndexType offset;
1447 };
1448
1449 static Target target(const Dimensions& target_dims,
1450 const Dimensions& target_strides, Scalar* target_data,
1451 IndexType target_offset = 0) {
1452 return Target(target_dims, target_strides, target_data, target_offset);
1453 }
1454
1455 template <typename TargetDimsIndexType, typename TargetStridesIndexType>
1456 static Target target(
1457 const DSizes<TargetDimsIndexType, NumDims>& target_dims,
1458 const DSizes<TargetStridesIndexType, NumDims>& target_strides,
1459 Scalar* target_data, IndexType target_offset = 0) {
1460 // DSizes constructor will do index type promotion if it's safe.
1461 return Target(Dimensions(target_dims), Dimensions(target_strides),
1462 target_data, target_offset);
1463 }
1464
1465 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void Run(
1466 const Target& target, const TensorBlockExpr& expr) {
1467 // Prepare evaluator for block expression.
1468 DefaultDevice default_device;
1469 TensorBlockEvaluator eval(expr, default_device);
1470
1471 // Tensor block expression dimension should match destination dimensions.
1472 eigen_assert(dimensions_match(target.dims, eval.dimensions()));
1473
1474 static const int Layout = TensorBlockEvaluator::Layout;
1475 static const bool is_col_major = Layout == ColMajor;
1476
1477 // Initialize output inner dimension size based on a layout.
1478 const IndexType output_size = NumDims == 0 ? 1 : target.dims.TotalSize();
1479 const int inner_dim_idx = is_col_major ? 0 : NumDims - 1;
1480 IndexType output_inner_dim_size = target.dims[inner_dim_idx];
1481
1482 // Target inner dimension stride must be '1'.
1483 eigen_assert(target.strides[inner_dim_idx] == 1);
1484
1485 // Squeeze multiple inner dims into one if they are contiguous in `target`.
1486 IndexType num_squeezed_dims = 0;
1487 for (Index i = 1; i < NumDims; ++i) {
1488 const Index dim = is_col_major ? i : NumDims - i - 1;
1489 const IndexType target_stride = target.strides[dim];
1490
1491 if (output_inner_dim_size == target_stride) {
1492 output_inner_dim_size *= target.dims[dim];
1493 num_squeezed_dims++;
1494 } else {
1495 break;
1496 }
1497 }
1498
1499 // Initialize output block iterator state. Dimensions in this array are
1500 // always in innermost -> outermost order (col major layout).
1501 array<BlockIteratorState, NumDims> it;
1502
1503 int idx = 0; // currently initialized iterator state index
1504 for (Index i = num_squeezed_dims; i < NumDims - 1; ++i) {
1505 const Index dim = is_col_major ? i + 1 : NumDims - i - 2;
1506
1507 it[idx].count = 0;
1508 it[idx].size = target.dims[dim];
1509 it[idx].output_stride = target.strides[dim];
1510 it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
1511 idx++;
1512 }
1513
1514 // We read block expression from the beginning, and start writing data to
1515 // `target` at given offset.
1516 IndexType input_offset = 0;
1517 IndexType output_offset = target.offset;
1518
1519 // Iterate copying data from `eval` to `target`.
1520 for (IndexType i = 0; i < output_size; i += output_inner_dim_size) {
1521 // Assign to `target` at current offset.
1522 InnerDimAssign<Vectorizable && TensorBlockEvaluator::PacketAccess,
1523 TensorBlockEvaluator>::Run(target.data + output_offset,
1524 output_inner_dim_size, eval,
1525 input_offset);
1526
1527 // Move input offset forward by the number of assigned coefficients.
1528 input_offset += output_inner_dim_size;
1529
1530 // Update index.
1531 for (int j = 0; j < idx; ++j) {
1532 if (++it[j].count < it[j].size) {
1533 output_offset += it[j].output_stride;
1534 break;
1535 }
1536 it[j].count = 0;
1537 output_offset -= it[j].output_span;
1538 }
1539 }
1540 }
1541
1542 private:
1543 struct BlockIteratorState {
1544 BlockIteratorState()
1545 : count(0), size(0), output_stride(0), output_span(0) {}
1546
1547 IndexType count;
1548 IndexType size;
1549 IndexType output_stride;
1550 IndexType output_span;
1551 };
1552 };
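// Example (illustrative sketch): evaluating a lazy block expression into a
// dense column major [2, 3] buffer at `target_ptr` (assumed valid storage).
// Here `BlockExpr` stands for the lazy expression type, e.g. the `XprType` of
// the TensorCwiseUnaryBlock above, and `block_expr` for the value returned by
// its `expr()` method.
//
//   typedef TensorBlockAssignment<float, 2, BlockExpr> Assignment;
//   DSizes<Eigen::Index, 2> dims(2, 3);
//   Assignment::Run(
//       Assignment::target(dims, strides<ColMajor>(dims), target_ptr),
//       block_expr);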
1553
1554 // -------------------------------------------------------------------------- //
1555
1556 } // namespace internal
1557 } // namespace Eigen
1558
1559 #endif // EIGEN_CXX11_TENSOR_TENSOR_BLOCK_H
1560