// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H

namespace Eigen {

/** \class TensorConvolution
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor convolution class.
  *
  *
  */
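// Illustrative usage (a minimal sketch; the tensor sizes below are arbitrary).
// A TensorConvolutionOp is normally created through TensorBase::convolve(),
// which performs a "valid" convolution: each convolved dimension shrinks by
// kernel_dim - 1.
//
//   Eigen::Tensor<float, 3> input(20, 30, 50);
//   Eigen::Tensor<float, 2> kernel(3, 3);
//   Eigen::Tensor<float, 3> output(18, 28, 50);
//   input.setRandom();
//   kernel.setRandom();
//
//   Eigen::array<ptrdiff_t, 2> dims({0, 1});  // convolve along dimensions 0 and 1
//   output = input.convolve(kernel, dims);    // output dim = input dim - kernel dim + 1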
namespace internal {

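// IndexMapper is a helper for the GPU convolution kernels defined further down
// in this file. It maps a "plane" index (a linear index over the dimensions
// that are not being convolved) plus coordinates along the convolved dimensions
// back to linear coefficient offsets in the input and output tensors, for both
// ColMajor and RowMajor layouts.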
template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper {
 public:
  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
              const array<Index, NumKernelDims>& indices) {

    array<Index, NumDims> dimensions = input_dims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = indices[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      dimensions[index] = result_dim;
    }

    array<Index, NumDims> inputStrides;
    array<Index, NumDims> outputStrides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      inputStrides[0] = 1;
      outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
        outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
      }
    } else {
      inputStrides[NumDims - 1] = 1;
      outputStrides[NumDims - 1] = 1;
      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
      }
    }

    array<Index, NumDims> gpuInputDimensions;
    array<Index, NumDims> gpuOutputDimensions;
    array<Index, NumDims> tmp = dimensions;
    array<Index, NumDims> ordering;
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = i + offset;
      ordering[index] = indices[i];
      tmp[indices[i]] = -1;
      gpuInputDimensions[index] = input_dims[indices[i]];
      gpuOutputDimensions[index] = dimensions[indices[i]];
    }

    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                      ? NumKernelDims
                      : 0;
    for (int i = 0; i < NumDims; ++i) {
      if (tmp[i] >= 0) {
        ordering[written] = i;
        gpuInputDimensions[written] = input_dims[i];
        gpuOutputDimensions[written] = dimensions[i];
        ++written;
      }
    }

    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = inputStrides[ordering[i]];
      m_outputStrides[i] = outputStrides[ordering[i]];
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumDims; ++i) {
        if (i > NumKernelDims) {
          m_gpuInputStrides[i] =
              m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
          m_gpuOutputStrides[i] =
              m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
        } else {
          m_gpuInputStrides[i] = 1;
          m_gpuOutputStrides[i] = 1;
        }
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        if (static_cast<size_t>(i + 1) < offset) {
          m_gpuInputStrides[i] =
              m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
          m_gpuOutputStrides[i] =
              m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
        } else {
          m_gpuInputStrides[i] = 1;
          m_gpuOutputStrides[i] = 1;
        }
      }
    }
  }

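  // The mapGpu*PlaneToTensor*Offset functions convert a plane index (a linear
  // index over the non-convolved dimensions) into the linear offset of that
  // plane's first coefficient; the mapGpu*KernelToTensor*Offset overloads then
  // add the offset for coordinates along the 1, 2 or 3 convolved dimensions.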
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_gpuInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_gpuInputStrides[d];
      }
      inputIndex += p * m_inputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_gpuInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_gpuInputStrides[d];
      }
      inputIndex += p * m_inputStrides[limit];
    }
    return inputIndex;
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
    Index outputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_gpuOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_gpuOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_gpuOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_gpuOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[limit];
    }
    return outputIndex;
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
           k * m_inputStrides[offset + 2];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
           k * m_outputStrides[offset + 2];
  }

 private:
  static const int NumDims = internal::array_size<InputDims>::value;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_outputStrides;
  array<Index, NumDims> m_gpuInputStrides;
  array<Index, NumDims> m_gpuOutputStrides;
};



template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
{
  // Type promotion to handle the case where the types of the lhs and the rhs are different.
  typedef typename promote_storage_type<typename InputXprType::Scalar,
                                        typename KernelXprType::Scalar>::ret Scalar;
  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
  typedef typename promote_index_type<typename traits<InputXprType>::Index,
                                      typename traits<KernelXprType>::Index>::type Index;
  typedef typename InputXprType::Nested LhsNested;
  typedef typename KernelXprType::Nested RhsNested;
  typedef typename remove_reference<LhsNested>::type _LhsNested;
  typedef typename remove_reference<RhsNested>::type _RhsNested;
  static const int NumDimensions = traits<InputXprType>::NumDimensions;
  static const int Layout = traits<InputXprType>::Layout;
  typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
  typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType;

  enum {
    Flags = 0
  };
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
{
  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
{
  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
};

}  // end namespace internal



template<typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
{
  public:
  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
  typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
  typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
                                                  typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
  typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
  typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
  typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
      : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Indices& indices() const { return m_indices; }

    /** \returns the nested expressions */
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const typename internal::remove_all<typename InputXprType::Nested>::type&
    inputExpression() const { return m_input_xpr; }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const typename internal::remove_all<typename KernelXprType::Nested>::type&
    kernelExpression() const { return m_kernel_xpr; }

  protected:
    typename InputXprType::Nested m_input_xpr;
    typename KernelXprType::Nested m_kernel_xpr;
    const Indices m_indices;
};


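// Generic (CPU) evaluator: each output coefficient is computed on demand by
// recursing over the kernel dimensions (see convolve() below) and accumulating
// input * kernel products, so no intermediate buffer is required.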
template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<Scalar, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  enum {
    IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
    PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<InputArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
      }
    } else {
      m_inputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
      }
    }

    m_dimensions = m_inputImpl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumKernelDims; ++i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i > 0) {
          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
        } else {
          m_kernelStride[0] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
      }
    } else {
      for (int i = NumKernelDims - 1; i >= 0; --i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i < NumKernelDims - 1) {
          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
        } else {
          m_kernelStride[NumKernelDims - 1] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    preloadKernel();
    return true;
  }
  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  void evalTo(typename XprType::Scalar* buffer) {
    evalSubExprsIfNeeded(NULL);
    for (int i = 0; i < dimensions().TotalSize(); ++i) {
      buffer[i] += coeff(i);
    }
    cleanup();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    CoeffReturnType result = CoeffReturnType(0);
    convolve(firstInput(index), 0, NumKernelDims-1, result);
    return result;
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
  {
    Index indices[2] = {index, index+PacketSize-1};
    Index startInputs[2] = {0, 0};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    }
    startInputs[0] += indices[0];
    startInputs[1] += indices[1];

    if (startInputs[1]-startInputs[0] == PacketSize-1) {
      PacketReturnType result = internal::pset1<PacketReturnType>(0);
      convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
      return result;
    } else {
      EIGEN_ALIGN_MAX Scalar data[PacketSize];
      data[0] = Scalar(0);
      convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
      for (int i = 1; i < PacketSize-1; ++i) {
        data[i] = Scalar(0);
        convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
      }
      data[PacketSize-1] = Scalar(0);
      convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
      return internal::pload<PacketReturnType>(data);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
                                       PacketSize));
  }

  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }

 private:
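  // Maps an output coefficient index to the linear index of the first input
  // coefficient that contributes to it (the origin of the kernel window).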
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    }
    startInput += index;
    return startInput;
  }

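  // Accumulates the convolution sum by recursing from the last kernel
  // dimension down to dimension 0; at the innermost level it adds
  // input(coeff) * kernel(coeff) into the accumulator. convolvePacket below is
  // the packet (SIMD) variant of the same recursion.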
  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolve(input, kernel, DimIndex-1, accum);
      } else {
        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
      }
    }
  }

  template <typename Packet>
  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolvePacket(input, kernel, DimIndex-1, accum);
      } else {
        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  array<Index, NumDims> m_inputStride;
  array<Index, NumDims> m_outputStride;

  array<Index, NumKernelDims> m_indexStride;
  array<Index, NumKernelDims> m_kernelStride;
  TensorEvaluator<InputArgType, Device> m_inputImpl;
  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
  Dimensions m_dimensions;

  KernelArgType m_kernelArg;
  const Scalar* m_kernel;
  bool m_local_kernel;
  const Device EIGEN_DEVICE_REF m_device;
};




// Use an optimized implementation of the evaluation code for GPUs whenever possible.
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

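// GetKernelSize returns the compile-time kernel size when it is known
// statically, and falls back to the runtime value for the Dynamic
// specialization; this lets the kernels below fully unroll their inner loops
// when the size is fixed.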
template <int StaticKernelSize>
struct GetKernelSize {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
    return StaticKernelSize;
  }
};
template <>
struct GetKernelSize<Dynamic> {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
    return kernelSize;
  }
};

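// 1D convolution kernel: each block cooperatively loads a contiguous strip of
// the input (the output strip plus kernelSize - 1 extra values) into shared
// memory, then each thread computes output coefficients as dot products of
// that strip with the kernel.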
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSize>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int kernelSize, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
  const int num_x_output = last_x - first_x + 1;

  const int first_plane = blockIdx.y * blockDim.y;
  const int plane_stride = blockDim.y * gridDim.y;

  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    // Load inputs to shared memory
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.y * num_x_input;
    #pragma unroll
    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
      const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    }

    __syncthreads();

    // Compute the convolution
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
      const int kernel_offset = plane_kernel_offset + i;
      float result = 0.0f;
      #pragma unroll
      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
        result += s[k + kernel_offset] * kernel[k];
      }
      const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
      buffer[tensor_index] = result;
    }
    __syncthreads();
  }
};

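// 2D convolution kernel: same strategy as the 1D kernel, but each block stages
// a (num_x_input x num_y_input) tile of the input plane in shared memory and
// accumulates over both kernel dimensions.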
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSizeX, int StaticKernelSizeY>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int numY, const int maxY, const int kernelSizeX,
    const int kernelSizeY, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
  const int num_x_output = last_x - first_x + 1;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
  const int num_y_output = last_y - first_y + 1;

  const int first_plane = blockIdx.z * blockDim.z;
  const int plane_stride = blockDim.z * gridDim.z;

  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {

    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.z * num_y_input;

    // Load inputs to shared memory
    #pragma unroll
    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
      const int input_offset = num_x_input * (j + plane_kernel_offset);
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
        const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
        s[i + input_offset] = eval.coeff(tensor_index);
      }
    }

    __syncthreads();

    // Convolution
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
        float result = 0.0f;
        #pragma unroll
        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
          const int kernel_offset = kernelSizeX * l;
          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
          #pragma unroll
          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
            result += s[k + input_offset] * kernel[k + kernel_offset];
          }
        }
        const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
        buffer[tensor_index] = result;
      }
    }

    __syncthreads();
  }
};

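// 3D convolution kernel: each block stages a 3D tile of the current plane in
// shared memory and accumulates over the three kernel dimensions; planes are
// processed sequentially by every block.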
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const size_t numPlanes, const size_t numX,
    const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
    const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  // Load inputs to shared memory
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + kernelSizeX;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + kernelSizeY;

  const int first_z = blockIdx.z * maxZ;
  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
  const int num_z_input = last_z - first_z + kernelSizeZ;

  for (int p = 0; p < numPlanes; ++p) {

    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = 0;

    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
          const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
        }
      }
    }

    __syncthreads();

    // Convolution
    const int num_z_output = last_z - first_z + 1;
    const int num_y_output = last_y - first_y + 1;
    const int num_x_output = last_x - first_x + 1;
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
          float result = 0.0f;
          for (int n = 0; n < kernelSizeZ; ++n) {
            for (int m = 0; m < kernelSizeY; ++m) {
              for (int l = 0; l < kernelSizeX; ++l) {
                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
              }
            }
          }
          const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
          buffer[tensor_index] = result;
        }
      }
    }
    __syncthreads();
  }
};



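// GPU evaluator: unlike the generic evaluator above, the whole result is
// materialized up front. evalSubExprsIfNeeded() (via executeEval) selects block
// and shared-memory sizes and launches one of the EigenConvolutionKernel1D/2D/3D
// kernels; coeff() and packet() then simply read from the precomputed buffer.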
template<typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims =  internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;

  enum {
    IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  TensorEvaluator(const XprType& op, const GpuDevice& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  static unsigned int ceil(unsigned int num, unsigned int denom) {
    const unsigned int rounded_toward_zero = num / denom;
    if (num > rounded_toward_zero * denom) {
      return rounded_toward_zero + 1;
    }
    return rounded_toward_zero;
  }

  void executeEval(Scalar* data) const {
    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;

    const int maxSharedMem = m_device.sharedMemPerBlock();
    const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
    const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
    const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
    const int warpSize = 32;

    switch (NumKernelDims) {
      case 1: {
        const int kernel_size = m_kernelImpl.dimensions().TotalSize();

        const int numX = dimensions()[m_indices[0]];
        const int numP = dimensions().TotalSize() / numX;
        int maxX;
        dim3 block_size;

        const int single_stride_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 0
                : m_inputImpl.dimensions().rank() - 1;
        if (m_indices[0] == single_stride_dim) {
          // Maximize the reuse
          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
          maxX = numext::mini<int>(inner_dim, numX);
          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        }
        else {
          // Read as much as possible along the innermost dimension, that is the plane
          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
          const int maxP = numext::mini<int>(inner_dim, numP);
          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);

          block_size.x = numext::mini(warpSize, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
        }

        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);

        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));


        //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;

        const array<Index, 1> indices(m_indices[0]);
        const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch(kernel_size) {
          case 4: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
            break;
          }
          case 7: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
            break;
          }
          default: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
          }
        }
        break;
      }

      case 2: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numP = dimensions().TotalSize() / (numX*numY);

        const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));

        // Snap maxX to warp size
        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
        const int maxX = numext::mini<int>(inner_dim, numX);
        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

        dim3 block_size;
        block_size.x = numext::mini(1024, maxX);
        block_size.y = numext::mini<int>(1024/block_size.x, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);

        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int num_y_blocks = ceil(numY, maxY);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));


        //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;

        const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY]);
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size_x) {
          case 4: {
            switch (kernel_size_y) {
              case 7: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                break;
              }
              default: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          case 7: {
            switch (kernel_size_y) {
              case 4: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                break;
              }
              default: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          default: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
            break;
          }
        }
        break;
      }

      case 3: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
        const int idxZ =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;

        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numZ = dimensions()[m_indices[idxZ]];
        const int numP = dimensions().TotalSize() / (numX*numY*numZ);

        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));

        dim3 block_size;
        block_size.x = numext::mini(32, maxX);
        block_size.y = numext::mini(32, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y  << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z  << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
        const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
                                      m_indices[idxZ]);
        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY],
                                          m_kernelImpl.dimensions()[idxZ]);
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);

        LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
    // model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
                                       PacketSize));
  }

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator& operator = (const TensorEvaluator&);

  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  Indices m_indices;
  Dimensions m_dimensions;
  Scalar* m_buf;
  const Scalar* m_kernel;
  bool m_local_kernel;

  const GpuDevice& m_device;
};
#endif


} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H