// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H

namespace Eigen {

/** \class TensorConvolution
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor convolution class.
  *
  * Computes the convolution of an input tensor with a kernel tensor over the
  * set of dimensions specified by the Indices argument. Each convolved
  * dimension of the result has size input_dim - kernel_dim + 1.
  */
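
// Informal usage sketch (illustrative only, not part of the library). The op
// is normally created through TensorBase::convolve(); the tensor sizes and
// variable names below are arbitrary:
//
//   Eigen::Tensor<float, 3> input(4, 20, 7);
//   Eigen::Tensor<float, 1> kernel(3);
//   input.setRandom();
//   kernel.setRandom();
//   Eigen::array<ptrdiff_t, 1> dims({1});  // convolve along dimension 1
//   Eigen::Tensor<float, 3> output = input.convolve(kernel, dims);
//   // output has dimensions (4, 18, 7) since 20 - 3 + 1 == 18.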

namespace internal {

template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper {
 public:
  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
              const array<Index, NumKernelDims>& indices) {

    array<Index, NumDims> dimensions = input_dims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = indices[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      dimensions[index] = result_dim;
    }

    array<Index, NumDims> inputStrides;
    array<Index, NumDims> outputStrides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      inputStrides[0] = 1;
      outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        inputStrides[i] = inputStrides[i - 1] * input_dims[i - 1];
        outputStrides[i] = outputStrides[i - 1] * dimensions[i - 1];
      }
    } else {
      inputStrides[NumDims - 1] = 1;
      outputStrides[NumDims - 1] = 1;
      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
      }
    }

    array<Index, NumDims> gpuInputDimensions;
    array<Index, NumDims> gpuOutputDimensions;
    array<Index, NumDims> tmp = dimensions;
    array<Index, NumDims> ordering;
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = i + offset;
      ordering[index] = indices[i];
      tmp[indices[i]] = -1;
      gpuInputDimensions[index] = input_dims[indices[i]];
      gpuOutputDimensions[index] = dimensions[indices[i]];
    }

    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                      ? NumKernelDims
                      : 0;
    for (int i = 0; i < NumDims; ++i) {
      if (tmp[i] >= 0) {
        ordering[written] = i;
        gpuInputDimensions[written] = input_dims[i];
        gpuOutputDimensions[written] = dimensions[i];
        ++written;
      }
    }

    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = inputStrides[ordering[i]];
      m_outputStrides[i] = outputStrides[ordering[i]];
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumDims; ++i) {
        if (i > NumKernelDims) {
          m_gpuInputStrides[i] =
              m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
          m_gpuOutputStrides[i] =
              m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
        } else {
          m_gpuInputStrides[i] = 1;
          m_gpuOutputStrides[i] = 1;
        }
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        if (static_cast<size_t>(i + 1) < offset) {
          m_gpuInputStrides[i] =
              m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
          m_gpuOutputStrides[i] =
              m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
        } else {
          m_gpuInputStrides[i] = 1;
          m_gpuOutputStrides[i] = 1;
        }
      }
    }
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_gpuInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_gpuInputStrides[d];
      }
      inputIndex += p * m_inputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_gpuInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_gpuInputStrides[d];
      }
      inputIndex += p * m_inputStrides[limit];
    }
    return inputIndex;
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
    Index outputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_gpuOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_gpuOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_gpuOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_gpuOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[limit];
    }
    return outputIndex;
  }
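
  // The overloads below map a coordinate inside the convolution (kernel)
  // dimensions to a flat tensor offset; one overload is provided per
  // supported number of kernel dimensions (1, 2 and 3).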
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
           k * m_inputStrides[offset + 2];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
           k * m_outputStrides[offset + 2];
  }

 private:
  static const int NumDims = internal::array_size<InputDims>::value;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_outputStrides;
  array<Index, NumDims> m_gpuInputStrides;
  array<Index, NumDims> m_gpuOutputStrides;
};
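
// Note on IndexMapper (informal, derived from the code above): for the GPU
// path the convolved dimensions are logically moved to the innermost
// positions and every combination of the remaining dimensions is treated as
// one "plane". mapGpu{Input,Output}PlaneToTensor{Input,Output}Offset()
// translates a linear plane index into the flat offset of that plane's first
// element, while the mapGpu*KernelToTensor*Offset() overloads translate a
// coordinate inside the convolved dimensions into an offset relative to the
// plane. For example (illustrative), convolving a ColMajor tensor of size
// (30, 4, 5) along dimension 0 yields 4 * 5 = 20 planes, each holding the 30
// inputs (respectively 30 - kernel_size + 1 outputs) of one column.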

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
{
  // Type promotion to handle the case where the types of the lhs and the rhs are different.
  typedef typename promote_storage_type<typename InputXprType::Scalar,
                                        typename KernelXprType::Scalar>::ret Scalar;
  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
  typedef typename promote_index_type<typename traits<InputXprType>::Index,
                                      typename traits<KernelXprType>::Index>::type Index;
  typedef typename InputXprType::Nested LhsNested;
  typedef typename KernelXprType::Nested RhsNested;
  typedef typename remove_reference<LhsNested>::type _LhsNested;
  typedef typename remove_reference<RhsNested>::type _RhsNested;
  static const int NumDimensions = traits<InputXprType>::NumDimensions;
  static const int Layout = traits<InputXprType>::Layout;
  typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
                               typename traits<InputXprType>::PointerType,
                               typename traits<KernelXprType>::PointerType>::type PointerType;

  enum {
    Flags = 0
  };
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
{
  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
{
  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
};

}  // end namespace internal


template<typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
{
  public:
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
                                                    typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
        : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Indices& indices() const { return m_indices; }

    /** \returns the nested expressions */
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const typename internal::remove_all<typename InputXprType::Nested>::type&
    inputExpression() const { return m_input_xpr; }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const typename internal::remove_all<typename KernelXprType::Nested>::type&
    kernelExpression() const { return m_kernel_xpr; }

  protected:
    typename InputXprType::Nested m_input_xpr;
    typename KernelXprType::Nested m_kernel_xpr;
    const Indices m_indices;
};

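// Default (CPU) evaluator. Each output coefficient is computed on demand:
// coeff(index) locates the first input of the coefficient's receptive field
// via firstInput() and then convolve() recursively accumulates
// input * kernel over all kernel dimensions. packet() does the same for
// PacketSize adjacent outputs when their inputs are laid out contiguously.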
template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<Scalar, Device> Storage;
  typedef typename Storage::Type EvaluatorPointerType;

  enum {
    IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
    PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<InputArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
      }
    } else {
      m_inputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
      }
    }

    m_dimensions = m_inputImpl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumKernelDims; ++i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i > 0) {
          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
        } else {
          m_kernelStride[0] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
      }
    } else {
      for (int i = NumKernelDims - 1; i >= 0; --i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i < NumKernelDims - 1) {
          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
        } else {
          m_kernelStride[NumKernelDims - 1] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    preloadKernel();
    return true;
  }
  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  void evalTo(typename XprType::Scalar* buffer) {
    evalSubExprsIfNeeded(NULL);
    for (int i = 0; i < dimensions().TotalSize(); ++i) {
      buffer[i] += coeff(i);
    }
    cleanup();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    CoeffReturnType result = CoeffReturnType(0);
    convolve(firstInput(index), 0, NumKernelDims-1, result);
    return result;
  }
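
  // Vectorized path: evaluate PacketSize consecutive output coefficients at
  // once. If the first inputs of their receptive fields are also consecutive,
  // a single vectorized convolvePacket() pass is used; otherwise each
  // coefficient falls back to the scalar convolve() and the packet is
  // assembled from a small aligned buffer.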
  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
  {
    Index indices[2] = {index, index+PacketSize-1};
    Index startInputs[2] = {0, 0};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    }
    startInputs[0] += indices[0];
    startInputs[1] += indices[1];

    if (startInputs[1]-startInputs[0] == PacketSize-1) {
      PacketReturnType result = internal::pset1<PacketReturnType>(0);
      convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
      return result;
    } else {
      EIGEN_ALIGN_MAX Scalar data[PacketSize];
      data[0] = Scalar(0);
      convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
      for (int i = 1; i < PacketSize-1; ++i) {
        data[i] = Scalar(0);
        convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
      }
      data[PacketSize-1] = Scalar(0);
      convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
      return internal::pload<PacketReturnType>(data);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
                                       PacketSize));
  }

  EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }

 private:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    }
    startInput += index;
    return startInput;
  }

  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolve(input, kernel, DimIndex-1, accum);
      } else {
        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
      }
    }
  }

  template <typename Packet>
  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolvePacket(input, kernel, DimIndex-1, accum);
      } else {
        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  array<Index, NumDims> m_inputStride;
  array<Index, NumDims> m_outputStride;

  array<Index, NumKernelDims> m_indexStride;
  array<Index, NumKernelDims> m_kernelStride;
  TensorEvaluator<InputArgType, Device> m_inputImpl;
  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
  Dimensions m_dimensions;

  KernelArgType m_kernelArg;
  const Scalar* m_kernel;
  bool m_local_kernel;
  const Device EIGEN_DEVICE_REF m_device;
};


// Use an optimized implementation of the evaluation code for GPUs whenever possible.
#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)

template <int StaticKernelSize>
struct GetKernelSize {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
    return StaticKernelSize;
  }
};
template <>
struct GetKernelSize<Dynamic> {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
    return kernelSize;
  }
};
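
// GPU kernels. Rough sketch of the shared-memory tiling used below: each
// thread block handles a contiguous range of up to maxX outputs of the
// convolved dimension for one or more planes. It first stages the
// corresponding maxX + kernelSize - 1 inputs in shared memory, synchronizes,
// and then lets every thread accumulate kernelSize products per output.
// GetKernelSize lets the common kernel sizes (4 and 7) be compile-time
// constants so the unrolled inner loops have a fixed trip count.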
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSize>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int kernelSize, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
  const int num_x_output = last_x - first_x + 1;

  const int first_plane = blockIdx.y * blockDim.y;
  const int plane_stride = blockDim.y * gridDim.y;

  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    // Load inputs to shared memory
    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.y * num_x_input;
    #pragma unroll
    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
      const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    }

    __syncthreads();

    // Compute the convolution
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
      const int kernel_offset = plane_kernel_offset + i;
      float result = 0.0f;
      #pragma unroll
      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
        result += s[k + kernel_offset] * kernel[k];
      }
      const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
      buffer[tensor_index] = result;
    }
    __syncthreads();
  }
};
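
// 2-D analogue of the kernel above: each block stages a
// (num_x_input x num_y_input) shared-memory tile per plane and computes the
// corresponding (num_x_output x num_y_output) outputs.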
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSizeX, int StaticKernelSizeY>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int numY, const int maxY, const int kernelSizeX,
    const int kernelSizeY, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
  const int num_x_output = last_x - first_x + 1;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
  const int num_y_output = last_y - first_y + 1;

  const int first_plane = blockIdx.z * blockDim.z;
  const int plane_stride = blockDim.z * gridDim.z;

  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {

    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.z * num_y_input;

    // Load inputs to shared memory
    #pragma unroll
    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
      const int input_offset = num_x_input * (j + plane_kernel_offset);
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
        const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
        s[i + input_offset] = eval.coeff(tensor_index);
      }
    }

    __syncthreads();

    // Convolution
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
        float result = 0.0f;
        #pragma unroll
        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
          const int kernel_offset = kernelSizeX * l;
          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
          #pragma unroll
          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
            result += s[k + input_offset] * kernel[k + kernel_offset];
          }
        }
        const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
        buffer[tensor_index] = result;
      }
    }

    __syncthreads();
  }
};
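
// 3-D variant: a block owns one x/y/z tile and iterates over the planes
// sequentially (there is no plane-level parallelism inside a block here).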
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const size_t numPlanes, const size_t numX,
    const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
    const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
#if defined(EIGEN_HIPCC)
  HIP_DYNAMIC_SHARED(float, s)
#else
  extern __shared__ float s[];
#endif

  // Load inputs to shared memory
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + kernelSizeX;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + kernelSizeY;

  const int first_z = blockIdx.z * maxZ;
  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
  const int num_z_input = last_z - first_z + kernelSizeZ;

  for (int p = 0; p < numPlanes; ++p) {

    const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = 0;

    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
          const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
        }
      }
    }

    __syncthreads();

    // Convolution
    const int num_z_output = last_z - first_z + 1;
    const int num_y_output = last_y - first_y + 1;
    const int num_x_output = last_x - first_x + 1;
    const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);

    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
          float result = 0.0f;
          for (int n = 0; n < kernelSizeZ; ++n) {
            for (int m = 0; m < kernelSizeY; ++m) {
              for (int l = 0; l < kernelSizeX; ++l) {
                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
              }
            }
          }
          const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
          buffer[tensor_index] = result;
        }
      }
    }
    __syncthreads();
  }
};
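
// GPU evaluator: unlike the generic evaluator above, it materializes the full
// result. evalSubExprsIfNeeded() launches one of the kernels above via
// executeEval(), writing either into the caller-provided buffer or into an
// internally allocated m_buf; coeff() and packet() then simply read from that
// buffer.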
template<typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;

  enum {
    IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
  typedef internal::TensorBlockNotImplemented TensorBlock;
  //===--------------------------------------------------------------------===//

  TensorEvaluator(const XprType& op, const GpuDevice& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  static unsigned int ceil(unsigned int num, unsigned int denom) {
    const unsigned int rounded_toward_zero = num / denom;
    if (num > rounded_toward_zero * denom) {
      return rounded_toward_zero + 1;
    }
    return rounded_toward_zero;
  }
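
  // Chooses block/grid dimensions from the device's shared-memory and thread
  // limits, builds an IndexMapper for the convolved dimensions, and launches
  // the 1-D, 2-D or 3-D kernel. Common kernel sizes (4 and 7 taps) dispatch to
  // statically sized instantiations so the inner loops can be fully unrolled.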
  void executeEval(Scalar* data) const {
    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;

    const int maxSharedMem = m_device.sharedMemPerBlock();
    const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
    const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
    const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
    const int warpSize = 32;

    switch (NumKernelDims) {
      case 1: {
        const int kernel_size = m_kernelImpl.dimensions().TotalSize();

        const int numX = dimensions()[m_indices[0]];
        const int numP = dimensions().TotalSize() / numX;
        int maxX;
        dim3 block_size;

        const int single_stride_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 0
                : m_inputImpl.dimensions().rank() - 1;
        if (m_indices[0] == single_stride_dim) {
          // Maximize the reuse
          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
          maxX = numext::mini<int>(inner_dim, numX);
          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        }
        else {
          // Read as much as possible along the innermost dimension, that is the plane
          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
          const int maxP = numext::mini<int>(inner_dim, numP);
          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);

          block_size.x = numext::mini(warpSize, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        }

        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);

        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));

        //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;

        const array<Index, 1> indices(m_indices[0]);
        const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size) {
          case 4: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
            break;
          }
          case 7: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
            break;
          }
          default: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
          }
        }
        break;
      }

      case 2: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numP = dimensions().TotalSize() / (numX*numY);

        const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));

        // Snap maxX to warp size
        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
        const int maxX = numext::mini<int>(inner_dim, numX);
        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

        dim3 block_size;
        block_size.x = numext::mini(1024, maxX);
        block_size.y = numext::mini<int>(1024/block_size.x, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);

        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int num_y_blocks = ceil(numY, maxY);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));

        //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;

        const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY]);
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size_x) {
          case 4: {
            switch (kernel_size_y) {
              case 7: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                break;
              }
              default: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          case 7: {
            switch (kernel_size_y) {
              case 4: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                break;
              }
              default: {
                LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          default: {
            LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
            break;
          }
        }
        break;
      }

      case 3: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
        const int idxZ =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;

        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numZ = dimensions()[m_indices[idxZ]];
        const int numP = dimensions().TotalSize() / (numX*numY*numZ);

        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));

        dim3 block_size;
        block_size.x = numext::mini(32, maxX);
        block_size.y = numext::mini(32, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
        gpu_assert(shared_mem <= maxSharedMem);

        //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
        const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
                                      m_indices[idxZ]);
        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY],
                                          m_kernelImpl.dimensions()[idxZ]);
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);

        LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
    // model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
                                       PacketSize));
  }

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator& operator = (const TensorEvaluator&);

  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  Indices m_indices;
  Dimensions m_dimensions;
  Scalar* m_buf;
  const Scalar* m_kernel;
  bool m_local_kernel;

  const GpuDevice& m_device;
};
#endif


} // end namespace Eigen

#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H