// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H

namespace Eigen {

/** \class TensorConvolution
  * \ingroup CXX11_Tensor_Module
  *
  * \brief Tensor convolution class.
  *
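  * A minimal illustrative example (not part of the original documentation):
  * convolving a 3-D tensor with a 1-D kernel along dimension 1 via
  * Tensor::convolve(), which builds a TensorConvolutionOp. Each convolved
  * dimension shrinks from input_dim to input_dim - kernel_dim + 1.
  *
  * \code
  * Eigen::Tensor<float, 3> input(20, 30, 50);
  * Eigen::Tensor<float, 1> kernel(7);
  * input.setRandom();
  * kernel.setRandom();
  * Eigen::array<ptrdiff_t, 1> conv_dims({1});  // convolve along dimension 1
  * Eigen::Tensor<float, 3> output = input.convolve(kernel, conv_dims);
  * // output dimensions: 20 x 24 x 50  (24 == 30 - 7 + 1)
  * \endcode
  *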
  */
namespace internal {

template <typename Index, typename InputDims, int NumKernelDims, int Layout>
class IndexMapper {
 public:
  IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
              const array<Index, NumKernelDims>& indices) {

    array<Index, NumDims> dimensions = input_dims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = indices[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      dimensions[index] = result_dim;
    }

    array<Index, NumDims> inputStrides;
    array<Index, NumDims> outputStrides;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      inputStrides[0] = 1;
      outputStrides[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
        outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
      }
    } else {
      inputStrides[NumDims - 1] = 1;
      outputStrides[NumDims - 1] = 1;
      for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
        inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
        outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
      }
    }

    array<Index, NumDims> cudaInputDimensions;
    array<Index, NumDims> cudaOutputDimensions;
    array<Index, NumDims> tmp = dimensions;
    array<Index, NumDims> ordering;
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = i + offset;
      ordering[index] = indices[i];
      tmp[indices[i]] = -1;
      cudaInputDimensions[index] = input_dims[indices[i]];
      cudaOutputDimensions[index] = dimensions[indices[i]];
    }

    int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                      ? NumKernelDims
                      : 0;
    for (int i = 0; i < NumDims; ++i) {
      if (tmp[i] >= 0) {
        ordering[written] = i;
        cudaInputDimensions[written] = input_dims[i];
        cudaOutputDimensions[written] = dimensions[i];
        ++written;
      }
    }

    for (int i = 0; i < NumDims; ++i) {
      m_inputStrides[i] = inputStrides[ordering[i]];
      m_outputStrides[i] = outputStrides[ordering[i]];
    }

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumDims; ++i) {
        if (i > NumKernelDims) {
          m_cudaInputStrides[i] =
              m_cudaInputStrides[i - 1] * cudaInputDimensions[i - 1];
          m_cudaOutputStrides[i] =
              m_cudaOutputStrides[i - 1] * cudaOutputDimensions[i - 1];
        } else {
          m_cudaInputStrides[i] = 1;
          m_cudaOutputStrides[i] = 1;
        }
      }
    } else {
      for (int i = NumDims - 1; i >= 0; --i) {
        if (i + 1 < offset) {
          m_cudaInputStrides[i] =
              m_cudaInputStrides[i + 1] * cudaInputDimensions[i + 1];
          m_cudaOutputStrides[i] =
              m_cudaOutputStrides[i + 1] * cudaOutputDimensions[i + 1];
        } else {
          m_cudaInputStrides[i] = 1;
          m_cudaOutputStrides[i] = 1;
        }
      }
    }
  }

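  // The GPU path iterates over flattened "planes" (the non-convolved
  // dimensions) and, inside each plane, over the convolved dimensions. The
  // helpers below translate those plane/kernel coordinates back into linear
  // offsets in the original input and output tensors.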
  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputPlaneToTensorInputOffset(Index p) const {
    Index inputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_cudaInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_cudaInputStrides[d];
      }
      inputIndex += p * m_inputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_cudaInputStrides[d];
        inputIndex += idx * m_inputStrides[d];
        p -= idx * m_cudaInputStrides[d];
      }
      inputIndex += p * m_inputStrides[limit];
    }
    return inputIndex;
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputPlaneToTensorOutputOffset(Index p) const {
    Index outputIndex = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int d = NumDims - 1; d > NumKernelDims; --d) {
        const Index idx = p / m_cudaOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_cudaOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[NumKernelDims];
    } else {
      std::ptrdiff_t limit = 0;
      if (NumKernelDims < NumDims) {
        limit = NumDims - NumKernelDims - 1;
      }
      for (int d = 0; d < limit; ++d) {
        const Index idx = p / m_cudaOutputStrides[d];
        outputIndex += idx * m_outputStrides[d];
        p -= idx * m_cudaOutputStrides[d];
      }
      outputIndex += p * m_outputStrides[limit];
    }
    return outputIndex;
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
           k * m_inputStrides[offset + 2];
  }

  EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapCudaOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
    const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
                              ? 0
                              : NumDims - NumKernelDims;
    return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
           k * m_outputStrides[offset + 2];
  }

 private:
  static const int NumDims = internal::array_size<InputDims>::value;
  array<Index, NumDims> m_inputStrides;
  array<Index, NumDims> m_outputStrides;
  array<Index, NumDims> m_cudaInputStrides;
  array<Index, NumDims> m_cudaOutputStrides;
};


template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
{
  // Type promotion to handle the case where the types of the lhs and the rhs are different.
  typedef typename promote_storage_type<typename InputXprType::Scalar,
                                        typename KernelXprType::Scalar>::ret Scalar;
  typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
                                        typename traits<KernelXprType>::StorageKind>::ret StorageKind;
  typedef typename promote_index_type<typename traits<InputXprType>::Index,
                                      typename traits<KernelXprType>::Index>::type Index;
  typedef typename InputXprType::Nested LhsNested;
  typedef typename KernelXprType::Nested RhsNested;
  typedef typename remove_reference<LhsNested>::type _LhsNested;
  typedef typename remove_reference<RhsNested>::type _RhsNested;
  static const int NumDimensions = traits<InputXprType>::NumDimensions;
  static const int Layout = traits<InputXprType>::Layout;

  enum {
    Flags = 0
  };
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
{
  typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
};

template<typename Dimensions, typename InputXprType, typename KernelXprType>
struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
{
  typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
};

}  // end namespace internal



template<typename Indices, typename InputXprType, typename KernelXprType>
class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
{
  public:
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
    typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
                                                    typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
    typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
    typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
      : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const Indices& indices() const { return m_indices; }

    /** \returns the nested expressions */
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const typename internal::remove_all<typename InputXprType::Nested>::type&
    inputExpression() const { return m_input_xpr; }

    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    const typename internal::remove_all<typename KernelXprType::Nested>::type&
    kernelExpression() const { return m_kernel_xpr; }

  protected:
    typename InputXprType::Nested m_input_xpr;
    typename KernelXprType::Nested m_kernel_xpr;
    const Indices m_indices;
};

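// Evaluator for TensorConvolutionOp on generic (CPU-style) devices: every
// output coefficient (or packet) is computed on demand by recursively walking
// the kernel dimensions in convolve() / convolvePacket().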
template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;

  typedef typename XprType::Scalar Scalar;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  enum {
    IsAligned = TensorEvaluator<InputArgType, Device>::IsAligned & TensorEvaluator<KernelArgType, Device>::IsAligned,
    PacketAccess = TensorEvaluator<InputArgType, Device>::PacketAccess & TensorEvaluator<KernelArgType, Device>::PacketAccess,
    Layout = TensorEvaluator<InputArgType, Device>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      m_inputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
      }
    } else {
      m_inputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
      }
    }

    m_dimensions = m_inputImpl.dimensions();
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = 0; i < NumKernelDims; ++i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i > 0) {
          m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
        } else {
          m_kernelStride[0] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[0] = 1;
      for (int i = 1; i < NumDims; ++i) {
        m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
      }
    } else {
      for (int i = NumKernelDims - 1; i >= 0; --i) {
        const Index index = op.indices()[i];
        const Index input_dim = input_dims[index];
        const Index kernel_dim = kernel_dims[i];
        const Index result_dim = input_dim - kernel_dim + 1;
        m_dimensions[index] = result_dim;
        if (i < NumKernelDims - 1) {
          m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
        } else {
          m_kernelStride[NumKernelDims - 1] = 1;
        }
        m_indexStride[i] = m_inputStride[index];
      }

      m_outputStride[NumDims - 1] = 1;
      for (int i = NumDims - 2; i >= 0; --i) {
        m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
    m_inputImpl.evalSubExprsIfNeeded(NULL);
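    // Make the kernel coefficients available as a plain contiguous buffer
    // before any coeff() / packet() calls (see preloadKernel() below).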
    preloadKernel();
    return true;
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  void evalTo(typename XprType::Scalar* buffer) {
    evalSubExprsIfNeeded(NULL);
    for (int i = 0; i < dimensions().TotalSize(); ++i) {
      buffer[i] += coeff(i);
    }
    cleanup();
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    CoeffReturnType result = CoeffReturnType(0);
    convolve(firstInput(index), 0, NumKernelDims-1, result);
    return result;
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
  {
    Index indices[2] = {index, index+PacketSize-1};
    Index startInputs[2] = {0, 0};
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx0 = indices[0] / m_outputStride[i];
        const Index idx1 = indices[1] / m_outputStride[i];
        startInputs[0] += idx0 * m_inputStride[i];
        startInputs[1] += idx1 * m_inputStride[i];
        indices[0] -= idx0 * m_outputStride[i];
        indices[1] -= idx1 * m_outputStride[i];
      }
    }
    startInputs[0] += indices[0];
    startInputs[1] += indices[1];

    if (startInputs[1]-startInputs[0] == PacketSize-1) {
      PacketReturnType result = internal::pset1<PacketReturnType>(0);
      convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
      return result;
    } else {
      EIGEN_ALIGN_MAX Scalar data[PacketSize];
      data[0] = Scalar(0);
      convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
      for (int i = 1; i < PacketSize-1; ++i) {
        data[i] = Scalar(0);
        convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
      }
      data[PacketSize-1] = Scalar(0);
      convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
      return internal::pload<PacketReturnType>(data);
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
                                       PacketSize));
  }

  EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; }

 private:
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
    Index startInput = 0;
    if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
      for (int i = NumDims - 1; i > 0; --i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    } else {
      for (int i = 0; i < NumDims - 1; ++i) {
        const Index idx = index / m_outputStride[i];
        startInput += idx * m_inputStride[i];
        index -= idx * m_outputStride[i];
      }
    }
    startInput += index;
    return startInput;
  }

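  // Accumulate input * kernel over every kernel tap, recursing from the last
  // kernel dimension down to the first; the innermost level (DimIndex == 0)
  // performs the actual multiply-accumulate.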
  EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolve(input, kernel, DimIndex-1, accum);
      } else {
        accum += m_inputImpl.coeff(input) * m_kernel[kernel];
      }
    }
  }

  template <typename Packet>
  EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
    for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
      const Index input = firstIndex + j * m_indexStride[DimIndex];
      const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
      if (DimIndex > 0) {
        convolvePacket(input, kernel, DimIndex-1, accum);
      } else {
        accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
      }
    }
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Device, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Device, PacketAccess>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  array<Index, NumDims> m_inputStride;
  array<Index, NumDims> m_outputStride;

  array<Index, NumKernelDims> m_indexStride;
  array<Index, NumKernelDims> m_kernelStride;
  TensorEvaluator<InputArgType, Device> m_inputImpl;
  TensorEvaluator<KernelArgType, Device> m_kernelImpl;
  Dimensions m_dimensions;

  KernelArgType m_kernelArg;
  const Scalar* m_kernel;
  bool m_local_kernel;
  const Device& m_device;
};




// Use an optimized implementation of the evaluation code for GPUs whenever possible.
#if defined(EIGEN_USE_GPU) && defined(__CUDACC__)

template <int StaticKernelSize>
struct GetKernelSize {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
    return StaticKernelSize;
  }
};
template <>
struct GetKernelSize<Dynamic> {
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
    return kernelSize;
  }
};

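// 1-D case: each block loads a contiguous tile of the outputs plus the kernel
// apron (last_x - first_x + kernelSize coefficients) into shared memory, one
// row per plane handled by the block, then each thread computes a subset of
// the outputs from that tile.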
template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSize>
__global__ void EigenConvolutionKernel1D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int kernelSize, float* buffer) {
  extern __shared__ float s[];

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
  const int num_x_output = last_x - first_x + 1;

  const int first_plane = blockIdx.y * blockDim.y;
  const int plane_stride = blockDim.y * gridDim.y;

  for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
    // Load inputs to shared memory
    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.y * num_x_input;
    #pragma unroll
    for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
      const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x);
      s[i + plane_kernel_offset] = eval.coeff(tensor_index);
    }

    __syncthreads();

    // Compute the convolution
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
      const int kernel_offset = plane_kernel_offset + i;
      float result = 0.0f;
      #pragma unroll
      for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
        result += s[k + kernel_offset] * kernel[k];
      }
      const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x);
      buffer[tensor_index] = result;
    }
    __syncthreads();
  }
};

template <typename InputEvaluator, typename Index, typename InputDims,
          int StaticKernelSizeX, int StaticKernelSizeY>
__global__ void EigenConvolutionKernel2D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const int numPlanes, const int numX,
    const int maxX, const int numY, const int maxY, const int kernelSizeX,
    const int kernelSizeY, float* buffer) {
  extern __shared__ float s[];

  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
  const int num_x_output = last_x - first_x + 1;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
  const int num_y_output = last_y - first_y + 1;

  const int first_plane = blockIdx.z * blockDim.z;
  const int plane_stride = blockDim.z * gridDim.z;

  for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {

    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = threadIdx.z * num_y_input;

    // Load inputs to shared memory
    #pragma unroll
    for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
      const int input_offset = num_x_input * (j + plane_kernel_offset);
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
        const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y);
        s[i + input_offset] = eval.coeff(tensor_index);
      }
    }

    __syncthreads();

    // Convolution
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    #pragma unroll
    for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
      #pragma unroll
      for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
        float result = 0.0f;
        #pragma unroll
        for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
          const int kernel_offset = kernelSizeX * l;
          const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
          #pragma unroll
          for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
            result += s[k + input_offset] * kernel[k + kernel_offset];
          }
        }
        const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
        buffer[tensor_index] = result;
      }
    }

    __syncthreads();
  }
};

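// 3-D case: each block loads a full (x, y, z) tile plus the kernel apron into
// shared memory and walks the remaining planes sequentially, since no spare
// block dimension is left to parallelize over them.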
template <typename InputEvaluator, typename Index, typename InputDims>
__global__ void EigenConvolutionKernel3D(
    InputEvaluator eval,
    const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
        indexMapper,
    const float* __restrict kernel, const size_t numPlanes, const size_t numX,
    const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
    const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
    const size_t kernelSizeZ, float* buffer) {
  extern __shared__ float s[];

  // Load inputs to shared memory
  const int first_x = blockIdx.x * maxX;
  const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
  const int num_x_input = last_x - first_x + kernelSizeX;

  const int first_y = blockIdx.y * maxY;
  const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
  const int num_y_input = last_y - first_y + kernelSizeY;

  const int first_z = blockIdx.z * maxZ;
  const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
  const int num_z_input = last_z - first_z + kernelSizeZ;

  for (int p = 0; p < numPlanes; ++p) {

    const int plane_input_offset = indexMapper.mapCudaInputPlaneToTensorInputOffset(p);
    const int plane_kernel_offset = 0;

    for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
          const int tensor_index = plane_input_offset + indexMapper.mapCudaInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
          s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
        }
      }
    }

    __syncthreads();

    // Convolution
    const int num_z_output = last_z - first_z + 1;
    const int num_y_output = last_y - first_y + 1;
    const int num_x_output = last_x - first_x + 1;
    const int plane_output_offset = indexMapper.mapCudaOutputPlaneToTensorOutputOffset(p);

    for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
      for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
        for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
          float result = 0.0f;
          for (int n = 0; n < kernelSizeZ; ++n) {
            for (int m = 0; m < kernelSizeY; ++m) {
              for (int l = 0; l < kernelSizeX; ++l) {
                result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
              }
            }
          }
          const int tensor_index = plane_output_offset + indexMapper.mapCudaOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
          buffer[tensor_index] = result;
        }
      }
    }
    __syncthreads();
  }
};



template<typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
{
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;

  static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;

  enum {
    IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
    PacketAccess = false,
    Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
    CoordAccess = false,  // to be implemented
    RawAccess = false
  };

  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const GpuDevice& device)
      : m_inputImpl(op.inputExpression(), device), m_kernelArg(op.kernelExpression()), m_kernelImpl(op.kernelExpression(), device), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
  {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);

    const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();

    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }

  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;

  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }

  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
      executeEval(m_buf);
      return true;
    }
  }

  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate((void*)m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }

  EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated)
    const Scalar* in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(local, m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);

      m_kernel = local;
      m_local_kernel = true;
    }
  }

  static unsigned int ceil(unsigned int num, unsigned int denom) {
    const unsigned int rounded_toward_zero = num / denom;
    if (num > rounded_toward_zero * denom) {
      return rounded_toward_zero + 1;
    }
    return rounded_toward_zero;
  }

  void executeEval(Scalar* data) const {
    typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;

    const int maxSharedMem = m_device.sharedMemPerBlock();
    const int maxThreadsPerBlock = m_device.maxCudaThreadsPerBlock();
    const int maxBlocksPerProcessor = m_device.maxCudaThreadsPerMultiProcessor() / maxThreadsPerBlock;
    const int numMultiProcessors = m_device.getNumCudaMultiProcessors();
    const int warpSize = 32;

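    // For each supported number of convolution dimensions, pick block and grid
    // sizes so that one block's input tile (outputs plus the kernel apron)
    // fits within the available shared memory; the remaining parallelism is
    // spread across the non-convolved planes.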
    switch (NumKernelDims) {
      case 1: {
        const int kernel_size = m_kernelImpl.dimensions().TotalSize();

        const int numX = dimensions()[m_indices[0]];
        const int numP = dimensions().TotalSize() / numX;
        int maxX;
        dim3 block_size;

        const int single_stride_dim =
            static_cast<int>(Layout) == static_cast<int>(ColMajor)
                ? 0
                : m_inputImpl.dimensions().rank() - 1;
        if (m_indices[0] == single_stride_dim) {
          // Maximize the reuse
          const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
          maxX = numext::mini<int>(inner_dim, numX);
          const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
          block_size.x = numext::mini(maxThreadsPerBlock, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
        }
        else {
          // Read as much as possible along the innermost dimension, that is the plane
          const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
          const int maxP = numext::mini<int>(inner_dim, numP);
          maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);

          block_size.x = numext::mini(warpSize, maxX);
          block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
        }

        const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);

        dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));


        //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;

        const array<Index, 1> indices(m_indices[0]);
        const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch(kernel_size) {
          case 4: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
            break;
          }
          case 7: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
            break;
          }
          default: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
          }
        }
        break;
      }

      case 2: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numP = dimensions().TotalSize() / (numX*numY);

        const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));

        // Snap maxX to warp size
        int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
        const int maxX = numext::mini<int>(inner_dim, numX);
        const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
        const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);

        dim3 block_size;
        block_size.x = numext::mini(1024, maxX);
        block_size.y = numext::mini<int>(1024/block_size.x, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);

        const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        const int num_x_blocks = ceil(numX, maxX);
        const int num_y_blocks = ceil(numY, maxY);
        const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
        const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);

        dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));


        //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;

        const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
        const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY]);
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);
        switch (kernel_size_x) {
          case 4: {
            switch (kernel_size_y) {
              case 7: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
                break;
              }
              default: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          case 7: {
            switch (kernel_size_y) {
              case 4: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
                break;
              }
              default: {
                LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
                break;
              }
            }
            break;
          }
          default: {
            LAUNCH_CUDA_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
            break;
          }
        }
        break;
      }

      case 3: {
        const int idxX =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
        const int idxY =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
        const int idxZ =
            static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;

        const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
        const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
        const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];

        const int numX = dimensions()[m_indices[idxX]];
        const int numY = dimensions()[m_indices[idxY]];
        const int numZ = dimensions()[m_indices[idxZ]];
        const int numP = dimensions().TotalSize() / (numX*numY*numZ);

        const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
        const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
        const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));

        dim3 block_size;
        block_size.x = numext::mini(32, maxX);
        block_size.y = numext::mini(32, maxY);
        block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
        dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));

        const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
        assert(shared_mem <= maxSharedMem);

        //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
        const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
                                      m_indices[idxZ]);
        const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
                                          m_kernelImpl.dimensions()[idxY],
                                          m_kernelImpl.dimensions()[idxZ]);
        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
            m_inputImpl.dimensions(), kernel_dims, indices);

        LAUNCH_CUDA_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }

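  // executeEval() materializes the full result into m_buf (or into the
  // caller-provided buffer), so coeff() and packet() simply read back the
  // precomputed values.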
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }

  template<int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
  {
    eigen_assert(m_buf);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
  }

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
  costPerCoeff(bool vectorized) const {
    // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
    // model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost =
        TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
         TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
                          m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized,
                                       PacketSize));
  }

 private:
  // No assignment (copies are needed by the kernels)
  TensorEvaluator& operator = (const TensorEvaluator&);

  TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
  TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
  KernelArgType m_kernelArg;
  Indices m_indices;
  Dimensions m_dimensions;
  Scalar* m_buf;
  const Scalar* m_kernel;
  bool m_local_kernel;

  const GpuDevice& m_device;
};
#endif


}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H