// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2011 Benoit Jacob <jacob.benoit.1@gmail.com>
// Copyright (C) 2011-2014 Gael Guennebaud <gael.guennebaud@inria.fr>
// Copyright (C) 2011-2012 Jitse Niesen <jitse@maths.leeds.ac.uk>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_ASSIGN_EVALUATOR_H
#define EIGEN_ASSIGN_EVALUATOR_H

namespace Eigen {

// This implementation is based on Assign.h

namespace internal {

/***************************************************************************
* Part 1 : the logic deciding a strategy for traversal and unrolling       *
***************************************************************************/

// copy_using_evaluator_traits is based on assign_traits

template <typename DstEvaluator, typename SrcEvaluator, typename AssignFunc>
struct copy_using_evaluator_traits
{
  typedef typename DstEvaluator::XprType Dst;
  typedef typename Dst::Scalar DstScalar;

  enum {
    DstFlags = DstEvaluator::Flags,
    SrcFlags = SrcEvaluator::Flags
  };

public:
  enum {
    DstAlignment = DstEvaluator::Alignment,
    SrcAlignment = SrcEvaluator::Alignment,
    DstHasDirectAccess = DstFlags & DirectAccessBit,
    JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment)
  };

private:
  enum {
    InnerSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::SizeAtCompileTime)
              : int(DstFlags)&RowMajorBit ? int(Dst::ColsAtCompileTime)
              : int(Dst::RowsAtCompileTime),
    InnerMaxSize = int(Dst::IsVectorAtCompileTime) ? int(Dst::MaxSizeAtCompileTime)
                 : int(DstFlags)&RowMajorBit ? int(Dst::MaxColsAtCompileTime)
                 : int(Dst::MaxRowsAtCompileTime),
    OuterStride = int(outer_stride_at_compile_time<Dst>::ret),
    MaxSizeAtCompileTime = Dst::SizeAtCompileTime
  };

  // TODO distinguish between linear traversal and inner-traversals
  typedef typename find_best_packet<DstScalar,Dst::SizeAtCompileTime>::type LinearPacketType;
  typedef typename find_best_packet<DstScalar,InnerSize>::type InnerPacketType;

  enum {
    LinearPacketSize = unpacket_traits<LinearPacketType>::size,
    InnerPacketSize = unpacket_traits<InnerPacketType>::size
  };

public:
  enum {
    LinearRequiredAlignment = unpacket_traits<LinearPacketType>::alignment,
    InnerRequiredAlignment = unpacket_traits<InnerPacketType>::alignment
  };

private:
  enum {
    DstIsRowMajor = DstFlags&RowMajorBit,
    SrcIsRowMajor = SrcFlags&RowMajorBit,
    StorageOrdersAgree = (int(DstIsRowMajor) == int(SrcIsRowMajor)),
    MightVectorize = bool(StorageOrdersAgree)
                  && (int(DstFlags) & int(SrcFlags) & ActualPacketAccessBit)
                  && bool(functor_traits<AssignFunc>::PacketAccess),
    MayInnerVectorize = MightVectorize
                     && int(InnerSize)!=Dynamic && int(InnerSize)%int(InnerPacketSize)==0
                     && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0
                     && (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)),
    MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit),
    MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess
                      && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic),
      /* If the destination isn't aligned, we have to do runtime checks and we don't unroll,
         so it's only good for large enough sizes. */
    MaySliceVectorize = bool(MightVectorize) && bool(DstHasDirectAccess)
                     && (int(InnerMaxSize)==Dynamic || int(InnerMaxSize)>=(EIGEN_UNALIGNED_VECTORIZE?InnerPacketSize:(3*InnerPacketSize)))
      /* slice vectorization can be slow, so we only want it if the slices are big, which is
         indicated by InnerMaxSize rather than InnerSize: think of the case of a dynamic block
         in a fixed-size matrix.
         However, with EIGEN_UNALIGNED_VECTORIZE and unrolling, slice vectorization is still worth it */
  };

public:
  enum {
    Traversal = int(MayLinearVectorize) && (LinearPacketSize>InnerPacketSize) ? int(LinearVectorizedTraversal)
              : int(MayInnerVectorize)   ? int(InnerVectorizedTraversal)
              : int(MayLinearVectorize)  ? int(LinearVectorizedTraversal)
              : int(MaySliceVectorize)   ? int(SliceVectorizedTraversal)
              : int(MayLinearize)        ? int(LinearTraversal)
                                         : int(DefaultTraversal),
    Vectorized = int(Traversal) == InnerVectorizedTraversal
              || int(Traversal) == LinearVectorizedTraversal
              || int(Traversal) == SliceVectorizedTraversal
  };

  typedef typename conditional<int(Traversal)==LinearVectorizedTraversal, LinearPacketType, InnerPacketType>::type PacketType;

private:
  enum {
    ActualPacketSize = int(Traversal)==LinearVectorizedTraversal ? LinearPacketSize
                     : Vectorized ? InnerPacketSize
                     : 1,
    UnrollingLimit = EIGEN_UNROLLING_LIMIT * ActualPacketSize,
    MayUnrollCompletely = int(Dst::SizeAtCompileTime) != Dynamic
                       && int(Dst::SizeAtCompileTime) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit),
    MayUnrollInner = int(InnerSize) != Dynamic
                  && int(InnerSize) * (int(DstEvaluator::CoeffReadCost)+int(SrcEvaluator::CoeffReadCost)) <= int(UnrollingLimit)
  };

public:
  enum {
    Unrolling = (int(Traversal) == int(InnerVectorizedTraversal) || int(Traversal) == int(DefaultTraversal))
              ? (
                  int(MayUnrollCompletely) ? int(CompleteUnrolling)
                : int(MayUnrollInner)      ? int(InnerUnrolling)
                                           : int(NoUnrolling)
                )
              : int(Traversal) == int(LinearVectorizedTraversal)
              ? ( bool(MayUnrollCompletely) && ( EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)))
                          ? int(CompleteUnrolling)
                          : int(NoUnrolling) )
              : int(Traversal) == int(LinearTraversal)
              ? ( bool(MayUnrollCompletely) ? int(CompleteUnrolling)
                                            : int(NoUnrolling) )
#if EIGEN_UNALIGNED_VECTORIZE
              : int(Traversal) == int(SliceVectorizedTraversal)
              ? ( bool(MayUnrollInner) ? int(InnerUnrolling)
                                       : int(NoUnrolling) )
#endif
              : int(NoUnrolling)
  };

#ifdef EIGEN_DEBUG_ASSIGN
  static void debug()
  {
    std::cerr << "DstXpr: " << typeid(typename DstEvaluator::XprType).name() << std::endl;
    std::cerr << "SrcXpr: " << typeid(typename SrcEvaluator::XprType).name() << std::endl;
    std::cerr.setf(std::ios::hex, std::ios::basefield);
    std::cerr << "DstFlags" << " = " << DstFlags << " (" << demangle_flags(DstFlags) << " )" << std::endl;
    std::cerr << "SrcFlags" << " = " << SrcFlags << " (" << demangle_flags(SrcFlags) << " )" << std::endl;
    std::cerr.unsetf(std::ios::hex);
    EIGEN_DEBUG_VAR(DstAlignment)
    EIGEN_DEBUG_VAR(SrcAlignment)
    EIGEN_DEBUG_VAR(LinearRequiredAlignment)
    EIGEN_DEBUG_VAR(InnerRequiredAlignment)
    EIGEN_DEBUG_VAR(JointAlignment)
    EIGEN_DEBUG_VAR(InnerSize)
    EIGEN_DEBUG_VAR(InnerMaxSize)
    EIGEN_DEBUG_VAR(LinearPacketSize)
    EIGEN_DEBUG_VAR(InnerPacketSize)
    EIGEN_DEBUG_VAR(ActualPacketSize)
    EIGEN_DEBUG_VAR(StorageOrdersAgree)
    EIGEN_DEBUG_VAR(MightVectorize)
    EIGEN_DEBUG_VAR(MayLinearize)
    EIGEN_DEBUG_VAR(MayInnerVectorize)
    EIGEN_DEBUG_VAR(MayLinearVectorize)
    EIGEN_DEBUG_VAR(MaySliceVectorize)
    std::cerr << "Traversal" << " = " << Traversal << " (" << demangle_traversal(Traversal) << ")" << std::endl;
    EIGEN_DEBUG_VAR(SrcEvaluator::CoeffReadCost)
    EIGEN_DEBUG_VAR(UnrollingLimit)
    EIGEN_DEBUG_VAR(MayUnrollCompletely)
    EIGEN_DEBUG_VAR(MayUnrollInner)
    std::cerr << "Unrolling" << " = " << Unrolling << " (" << demangle_unrolling(Unrolling) << ")" << std::endl;
    std::cerr << std::endl;
  }
#endif
};
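
// Illustrative sketch (not part of the library): defining EIGEN_DEBUG_ASSIGN
// before including Eigen makes every assignment print the strategy selected by
// the traits above (via the debug() member, called from the kernel constructor
// in Part 4). For example:
//
//   #define EIGEN_DEBUG_ASSIGN
//   #include <Eigen/Dense>
//   int main() {
//     Eigen::Matrix4f a, b;
//     a.setRandom(); b.setRandom();
//     a = a + b; // typically reports Inner- or LinearVectorizedTraversal with
//                // CompleteUnrolling, depending on the SIMD target and
//                // EIGEN_UNALIGNED_VECTORIZE
//     return 0;
//   }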

/***************************************************************************
* Part 2 : meta-unrollers
***************************************************************************/

/************************
*** Default traversal ***
************************/

template<typename Kernel, int Index, int Stop>
struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling
{
  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
  typedef typename DstEvaluatorType::XprType DstXprType;

  enum {
    outer = Index / DstXprType::InnerSizeAtCompileTime,
    inner = Index % DstXprType::InnerSizeAtCompileTime
  };

  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    kernel.assignCoeffByOuterInner(outer, inner);
    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
  }
};

template<typename Kernel, int Stop>
struct copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, Stop, Stop>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
};

template<typename Kernel, int Index_, int Stop>
struct copy_using_evaluator_DefaultTraversal_InnerUnrolling
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
  {
    kernel.assignCoeffByOuterInner(outer, Index_);
    copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Index_+1, Stop>::run(kernel, outer);
  }
};

template<typename Kernel, int Stop>
struct copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, Stop, Stop>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index) { }
};
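
// Illustrative sketch (not part of the library): for a column-major 2x2
// destination, copy_using_evaluator_DefaultTraversal_CompleteUnrolling
// instantiated with Index=0 and Stop=4 recursively expands, at compile time,
// into the straight-line sequence
//
//   kernel.assignCoeffByOuterInner(0, 0);  // Index 0
//   kernel.assignCoeffByOuterInner(0, 1);  // Index 1
//   kernel.assignCoeffByOuterInner(1, 0);  // Index 2
//   kernel.assignCoeffByOuterInner(1, 1);  // Index 3
//
// with the empty <Kernel, Stop, Stop> specialization terminating the recursion.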

/***********************
*** Linear traversal ***
***********************/

template<typename Kernel, int Index, int Stop>
struct copy_using_evaluator_LinearTraversal_CompleteUnrolling
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel& kernel)
  {
    kernel.assignCoeff(Index);
    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Index+1, Stop>::run(kernel);
  }
};

template<typename Kernel, int Stop>
struct copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, Stop, Stop>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
};

/**************************
*** Inner vectorization ***
**************************/

template<typename Kernel, int Index, int Stop>
struct copy_using_evaluator_innervec_CompleteUnrolling
{
  // FIXME: this is not very clean, perhaps this information should be provided by the kernel?
  typedef typename Kernel::DstEvaluatorType DstEvaluatorType;
  typedef typename DstEvaluatorType::XprType DstXprType;
  typedef typename Kernel::PacketType PacketType;

  enum {
    outer = Index / DstXprType::InnerSizeAtCompileTime,
    inner = Index % DstXprType::InnerSizeAtCompileTime,
    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
    DstAlignment = Kernel::AssignmentTraits::DstAlignment
  };

  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
    enum { NextIndex = Index + unpacket_traits<PacketType>::size };
    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, NextIndex, Stop>::run(kernel);
  }
};

template<typename Kernel, int Stop>
struct copy_using_evaluator_innervec_CompleteUnrolling<Kernel, Stop, Stop>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&) { }
};

template<typename Kernel, int Index_, int Stop, int SrcAlignment, int DstAlignment>
struct copy_using_evaluator_innervec_InnerUnrolling
{
  typedef typename Kernel::PacketType PacketType;
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel, Index outer)
  {
    kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, Index_);
    enum { NextIndex = Index_ + unpacket_traits<PacketType>::size };
    copy_using_evaluator_innervec_InnerUnrolling<Kernel, NextIndex, Stop, SrcAlignment, DstAlignment>::run(kernel, outer);
  }
};

template<typename Kernel, int Stop, int SrcAlignment, int DstAlignment>
struct copy_using_evaluator_innervec_InnerUnrolling<Kernel, Stop, Stop, SrcAlignment, DstAlignment>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &, Index) { }
};
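
// Illustrative sketch (not part of the library): unlike the coefficient-wise
// unrollers above, these advance Index by the packet size. For a column-major
// Matrix4f destination with 4-wide float packets, the complete unrolling from
// Index=0 to Stop=16 expands into just four packet stores:
//
//   kernel.assignPacketByOuterInner<...>(0, 0);  // Index  0
//   kernel.assignPacketByOuterInner<...>(1, 0);  // Index  4
//   kernel.assignPacketByOuterInner<...>(2, 0);  // Index  8
//   kernel.assignPacketByOuterInner<...>(3, 0);  // Index 12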

/***************************************************************************
* Part 3 : implementation of all cases
***************************************************************************/

// dense_assignment_loop is based on assign_impl

template<typename Kernel,
         int Traversal = Kernel::AssignmentTraits::Traversal,
         int Unrolling = Kernel::AssignmentTraits::Unrolling>
struct dense_assignment_loop;

/************************
*** Default traversal ***
************************/

template<typename Kernel>
struct dense_assignment_loop<Kernel, DefaultTraversal, NoUnrolling>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    for(Index outer = 0; outer < kernel.outerSize(); ++outer) {
      for(Index inner = 0; inner < kernel.innerSize(); ++inner) {
        kernel.assignCoeffByOuterInner(outer, inner);
      }
    }
  }
};

template<typename Kernel>
struct dense_assignment_loop<Kernel, DefaultTraversal, CompleteUnrolling>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
  }
};

template<typename Kernel>
struct dense_assignment_loop<Kernel, DefaultTraversal, InnerUnrolling>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;

    const Index outerSize = kernel.outerSize();
    for(Index outer = 0; outer < outerSize; ++outer)
      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime>::run(kernel, outer);
  }
};
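
// Note (illustrative, not part of the library): "outer"/"inner" abstract over
// the storage order. For a column-major destination, outer iterates over
// columns and inner over rows; for a row-major destination it is the reverse.
// The NoUnrolling loop above is therefore equivalent, in the column-major
// case with the default assign functor, to:
//
//   for(Index col = 0; col < dst.cols(); ++col)
//     for(Index row = 0; row < dst.rows(); ++row)
//       dst.coeffRef(row, col) = src.coeff(row, col);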

/***************************
*** Linear vectorization ***
***************************/


// The goal of unaligned_dense_assignment_loop is simply to factor out the
// handling of the non-vectorizable beginning and ending parts

template <bool IsAligned = false>
struct unaligned_dense_assignment_loop
{
  // if IsAligned = true, then do nothing
  template <typename Kernel>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel&, Index, Index) {}
};

template <>
struct unaligned_dense_assignment_loop<false>
{
  // MSVC must not inline this function. If it does, it fails to optimize the
  // packet access path.
  // FIXME check which version exhibits this issue
#if EIGEN_COMP_MSVC
  template <typename Kernel>
  static EIGEN_DONT_INLINE void run(Kernel &kernel,
                                    Index start,
                                    Index end)
#else
  template <typename Kernel>
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel,
                                                        Index start,
                                                        Index end)
#endif
  {
    for (Index index = start; index < end; ++index)
      kernel.assignCoeff(index);
  }
};

template<typename Kernel>
struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, NoUnrolling>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    const Index size = kernel.size();
    typedef typename Kernel::Scalar Scalar;
    typedef typename Kernel::PacketType PacketType;
    enum {
      requestedAlignment = Kernel::AssignmentTraits::LinearRequiredAlignment,
      packetSize = unpacket_traits<PacketType>::size,
      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
      dstAlignment = packet_traits<Scalar>::AlignedOnScalar ? int(requestedAlignment)
                                                            : int(Kernel::AssignmentTraits::DstAlignment),
      srcAlignment = Kernel::AssignmentTraits::JointAlignment
    };
    const Index alignedStart = dstIsAligned ? 0 : internal::first_aligned<requestedAlignment>(kernel.dstDataPtr(), size);
    const Index alignedEnd = alignedStart + ((size-alignedStart)/packetSize)*packetSize;

    unaligned_dense_assignment_loop<dstIsAligned!=0>::run(kernel, 0, alignedStart);

    for(Index index = alignedStart; index < alignedEnd; index += packetSize)
      kernel.template assignPacket<dstAlignment, srcAlignment, PacketType>(index);

    unaligned_dense_assignment_loop<>::run(kernel, alignedEnd, size);
  }
};

template<typename Kernel>
struct dense_assignment_loop<Kernel, LinearVectorizedTraversal, CompleteUnrolling>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    typedef typename Kernel::PacketType PacketType;

    enum { size = DstXprType::SizeAtCompileTime,
           packetSize = unpacket_traits<PacketType>::size,
           alignedSize = (size/packetSize)*packetSize };

    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, alignedSize>::run(kernel);
    copy_using_evaluator_DefaultTraversal_CompleteUnrolling<Kernel, alignedSize, size>::run(kernel);
  }
};
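
// Worked example (illustrative, not part of the library): assume a VectorXf of
// size 11 whose data pointer sits 4 bytes past a 16-byte boundary, with 4-wide
// float packets requiring 16-byte alignment. Then first_aligned returns 3, so
//
//   alignedStart = 3
//   alignedEnd   = 3 + ((11-3)/4)*4 = 11
//
// and the NoUnrolling loop above performs 3 scalar stores, then 2 aligned
// packet stores covering indices 3..10, then 0 trailing scalar stores.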

/**************************
*** Inner vectorization ***
**************************/

template<typename Kernel>
struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, NoUnrolling>
{
  typedef typename Kernel::PacketType PacketType;
  enum {
    SrcAlignment = Kernel::AssignmentTraits::SrcAlignment,
    DstAlignment = Kernel::AssignmentTraits::DstAlignment
  };
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    const Index innerSize = kernel.innerSize();
    const Index outerSize = kernel.outerSize();
    const Index packetSize = unpacket_traits<PacketType>::size;
    for(Index outer = 0; outer < outerSize; ++outer)
      for(Index inner = 0; inner < innerSize; inner+=packetSize)
        kernel.template assignPacketByOuterInner<DstAlignment, SrcAlignment, PacketType>(outer, inner);
  }
};

template<typename Kernel>
struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, CompleteUnrolling>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    copy_using_evaluator_innervec_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
  }
};

template<typename Kernel>
struct dense_assignment_loop<Kernel, InnerVectorizedTraversal, InnerUnrolling>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    typedef typename Kernel::AssignmentTraits Traits;
    const Index outerSize = kernel.outerSize();
    for(Index outer = 0; outer < outerSize; ++outer)
      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, DstXprType::InnerSizeAtCompileTime,
                                                   Traits::SrcAlignment, Traits::DstAlignment>::run(kernel, outer);
  }
};
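
// Note (illustrative, not part of the library): InnerVectorizedTraversal is
// only selected when every inner slice is an exact multiple of the packet size
// and the outer stride preserves alignment (see MayInnerVectorize in Part 1),
// so the loops above need no scalar prologue or epilogue. A typical
// beneficiary is a column-major Matrix<float,8,Dynamic> with 4-wide packets:
// each column is covered by exactly two packet stores.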

/***********************
*** Linear traversal ***
***********************/

template<typename Kernel>
struct dense_assignment_loop<Kernel, LinearTraversal, NoUnrolling>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    const Index size = kernel.size();
    for(Index i = 0; i < size; ++i)
      kernel.assignCoeff(i);
  }
};

template<typename Kernel>
struct dense_assignment_loop<Kernel, LinearTraversal, CompleteUnrolling>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    copy_using_evaluator_LinearTraversal_CompleteUnrolling<Kernel, 0, DstXprType::SizeAtCompileTime>::run(kernel);
  }
};

/**************************
*** Slice vectorization ***
***************************/

template<typename Kernel>
struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, NoUnrolling>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::Scalar Scalar;
    typedef typename Kernel::PacketType PacketType;
    enum {
      packetSize = unpacket_traits<PacketType>::size,
      requestedAlignment = int(Kernel::AssignmentTraits::InnerRequiredAlignment),
      alignable = packet_traits<Scalar>::AlignedOnScalar || int(Kernel::AssignmentTraits::DstAlignment)>=sizeof(Scalar),
      dstIsAligned = int(Kernel::AssignmentTraits::DstAlignment)>=int(requestedAlignment),
      dstAlignment = alignable ? int(requestedAlignment)
                               : int(Kernel::AssignmentTraits::DstAlignment)
    };
    const Scalar *dst_ptr = kernel.dstDataPtr();
    if((!bool(dstIsAligned)) && (UIntPtr(dst_ptr) % sizeof(Scalar))>0)
    {
      // the pointer is not aligned on a scalar boundary, so alignment is not possible
      return dense_assignment_loop<Kernel,DefaultTraversal,NoUnrolling>::run(kernel);
    }
    const Index packetAlignedMask = packetSize - 1;
    const Index innerSize = kernel.innerSize();
    const Index outerSize = kernel.outerSize();
    const Index alignedStep = alignable ? (packetSize - kernel.outerStride() % packetSize) & packetAlignedMask : 0;
    Index alignedStart = ((!alignable) || bool(dstIsAligned)) ? 0 : internal::first_aligned<requestedAlignment>(dst_ptr, innerSize);

    for(Index outer = 0; outer < outerSize; ++outer)
    {
      const Index alignedEnd = alignedStart + ((innerSize-alignedStart) & ~packetAlignedMask);
      // do the non-vectorizable part of the assignment
      for(Index inner = 0; inner<alignedStart ; ++inner)
        kernel.assignCoeffByOuterInner(outer, inner);

      // do the vectorizable part of the assignment
      for(Index inner = alignedStart; inner<alignedEnd; inner+=packetSize)
        kernel.template assignPacketByOuterInner<dstAlignment, Unaligned, PacketType>(outer, inner);

      // do the non-vectorizable part of the assignment
      for(Index inner = alignedEnd; inner<innerSize ; ++inner)
        kernel.assignCoeffByOuterInner(outer, inner);

      alignedStart = numext::mini((alignedStart+alignedStep)%packetSize, innerSize);
    }
  }
};

#if EIGEN_UNALIGNED_VECTORIZE
template<typename Kernel>
struct dense_assignment_loop<Kernel, SliceVectorizedTraversal, InnerUnrolling>
{
  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel)
  {
    typedef typename Kernel::DstEvaluatorType::XprType DstXprType;
    typedef typename Kernel::PacketType PacketType;

    enum { size = DstXprType::InnerSizeAtCompileTime,
           packetSize = unpacket_traits<PacketType>::size,
           vectorizableSize = (size/packetSize)*packetSize };

    for(Index outer = 0; outer < kernel.outerSize(); ++outer)
    {
      copy_using_evaluator_innervec_InnerUnrolling<Kernel, 0, vectorizableSize, 0, 0>::run(kernel, outer);
      copy_using_evaluator_DefaultTraversal_InnerUnrolling<Kernel, vectorizableSize, size>::run(kernel, outer);
    }
  }
};
#endif
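
// Worked example (illustrative, not part of the library): take a 5x5 block of
// floats inside a larger column-major matrix, with 4-wide packets. Each column
// (inner slice) holds 5 coefficients, and consecutive columns generally start
// at different offsets modulo the packet size, which is why alignedStart is
// advanced by alignedStep after every outer iteration. A column whose first
// aligned element is at inner index 1 is processed as one scalar store, one
// packet store covering indices 1..4, and no tail.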

/***************************************************************************
* Part 4 : Generic dense assignment kernel
***************************************************************************/

// This class generalizes the assignment of a coefficient (or packet) from one dense evaluator
// to another dense writable evaluator.
// It is parametrized by the two evaluators and the actual assignment functor.
// This abstraction level makes it possible to keep the evaluation loops as simple and as generic as possible.
// One can customize the assignment using this generic dense_assignment_kernel with different
// functors, or by completely overloading it, by-passing a functor.
template<typename DstEvaluatorTypeT, typename SrcEvaluatorTypeT, typename Functor, int Version = Specialized>
class generic_dense_assignment_kernel
{
protected:
  typedef typename DstEvaluatorTypeT::XprType DstXprType;
  typedef typename SrcEvaluatorTypeT::XprType SrcXprType;
public:

  typedef DstEvaluatorTypeT DstEvaluatorType;
  typedef SrcEvaluatorTypeT SrcEvaluatorType;
  typedef typename DstEvaluatorType::Scalar Scalar;
  typedef copy_using_evaluator_traits<DstEvaluatorTypeT, SrcEvaluatorTypeT, Functor> AssignmentTraits;
  typedef typename AssignmentTraits::PacketType PacketType;

  EIGEN_DEVICE_FUNC generic_dense_assignment_kernel(DstEvaluatorType &dst, const SrcEvaluatorType &src, const Functor &func, DstXprType& dstExpr)
    : m_dst(dst), m_src(src), m_functor(func), m_dstExpr(dstExpr)
  {
    #ifdef EIGEN_DEBUG_ASSIGN
    AssignmentTraits::debug();
    #endif
  }

  EIGEN_DEVICE_FUNC Index size() const        { return m_dstExpr.size(); }
  EIGEN_DEVICE_FUNC Index innerSize() const   { return m_dstExpr.innerSize(); }
  EIGEN_DEVICE_FUNC Index outerSize() const   { return m_dstExpr.outerSize(); }
  EIGEN_DEVICE_FUNC Index rows() const        { return m_dstExpr.rows(); }
  EIGEN_DEVICE_FUNC Index cols() const        { return m_dstExpr.cols(); }
  EIGEN_DEVICE_FUNC Index outerStride() const { return m_dstExpr.outerStride(); }

  EIGEN_DEVICE_FUNC DstEvaluatorType& dstEvaluator() { return m_dst; }
  EIGEN_DEVICE_FUNC const SrcEvaluatorType& srcEvaluator() const { return m_src; }

  /// Assign src(row,col) to dst(row,col) through the assignment functor.
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index row, Index col)
  {
    m_functor.assignCoeff(m_dst.coeffRef(row,col), m_src.coeff(row,col));
  }

  /// \sa assignCoeff(Index,Index)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeff(Index index)
  {
    m_functor.assignCoeff(m_dst.coeffRef(index), m_src.coeff(index));
  }

  /// \sa assignCoeff(Index,Index)
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignCoeffByOuterInner(Index outer, Index inner)
  {
    Index row = rowIndexByOuterInner(outer, inner);
    Index col = colIndexByOuterInner(outer, inner);
    assignCoeff(row, col);
  }
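
  // Illustration (not part of the library): with the default assign_op, the
  // three overloads above reduce to `m_dst.coeffRef(...) = m_src.coeff(...)`,
  // while e.g. add_assign_op turns the same call sites into `+=`. The loops
  // in Part 3 therefore never need to know which operation they perform.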

  template<int StoreMode, int LoadMode, typename PacketType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index row, Index col)
  {
    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(row,col), m_src.template packet<LoadMode,PacketType>(row,col));
  }

  template<int StoreMode, int LoadMode, typename PacketType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacket(Index index)
  {
    m_functor.template assignPacket<StoreMode>(&m_dst.coeffRef(index), m_src.template packet<LoadMode,PacketType>(index));
  }

  template<int StoreMode, int LoadMode, typename PacketType>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void assignPacketByOuterInner(Index outer, Index inner)
  {
    Index row = rowIndexByOuterInner(outer, inner);
    Index col = colIndexByOuterInner(outer, inner);
    assignPacket<StoreMode,LoadMode,PacketType>(row, col);
  }

  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index rowIndexByOuterInner(Index outer, Index inner)
  {
    typedef typename DstEvaluatorType::ExpressionTraits Traits;
    return int(Traits::RowsAtCompileTime) == 1 ? 0
         : int(Traits::ColsAtCompileTime) == 1 ? inner
         : int(DstEvaluatorType::Flags)&RowMajorBit ? outer
         : inner;
  }

  EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Index colIndexByOuterInner(Index outer, Index inner)
  {
    typedef typename DstEvaluatorType::ExpressionTraits Traits;
    return int(Traits::ColsAtCompileTime) == 1 ? 0
         : int(Traits::RowsAtCompileTime) == 1 ? inner
         : int(DstEvaluatorType::Flags)&RowMajorBit ? inner
         : outer;
  }

  EIGEN_DEVICE_FUNC const Scalar* dstDataPtr() const
  {
    return m_dstExpr.data();
  }

protected:
  DstEvaluatorType& m_dst;
  const SrcEvaluatorType& m_src;
  const Functor &m_functor;
  // TODO find a way to avoid the need for the original expression
  DstXprType& m_dstExpr;
};

/***************************************************************************
* Part 5 : Entry point for dense rectangular assignment
***************************************************************************/

template<typename DstXprType,typename SrcXprType, typename Functor>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const Functor &/*func*/)
{
  EIGEN_ONLY_USED_FOR_DEBUG(dst);
  EIGEN_ONLY_USED_FOR_DEBUG(src);
  eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
}

template<typename DstXprType,typename SrcXprType, typename T1, typename T2>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::assign_op<T1,T2> &/*func*/)
{
  Index dstRows = src.rows();
  Index dstCols = src.cols();
  if(((dst.rows()!=dstRows) || (dst.cols()!=dstCols)))
    dst.resize(dstRows, dstCols);
  eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols);
}

template<typename DstXprType, typename SrcXprType, typename Functor>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func)
{
  typedef evaluator<DstXprType> DstEvaluatorType;
  typedef evaluator<SrcXprType> SrcEvaluatorType;

  SrcEvaluatorType srcEvaluator(src);

  // NOTE To properly handle A = (A*A.transpose())/s with A rectangular,
  // we need to resize the destination after the source evaluator has been created.
  resize_if_allowed(dst, src, func);

  DstEvaluatorType dstEvaluator(dst);

  typedef generic_dense_assignment_kernel<DstEvaluatorType,SrcEvaluatorType,Functor> Kernel;
  Kernel kernel(dstEvaluator, srcEvaluator, func, dst.const_cast_derived());

  dense_assignment_loop<Kernel>::run(kernel);
}

template<typename DstXprType, typename SrcXprType>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src)
{
  call_dense_assignment_loop(dst, src, internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar>());
}
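
// Illustrative sketch (not part of the library): for a plain `dst = src`
// between dense objects, the entry point above essentially performs
//
//   evaluator<Src> srcEval(src);        // evaluate nested expressions once
//   resize_if_allowed(dst, src, func);  // resizes only for plain assign_op
//   evaluator<Dst> dstEval(dst);
//   typedef generic_dense_assignment_kernel<evaluator<Dst>, evaluator<Src>, Func> Kernel;
//   Kernel kernel(dstEval, srcEval, func, dst);
//   dense_assignment_loop<Kernel>::run(kernel);
//
// with the traversal and unrolling strategy resolved entirely at compile time
// through the kernel's AssignmentTraits.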

/***************************************************************************
* Part 6 : Generic assignment
***************************************************************************/

// Based on the respective shapes of the destination and source,
// the class AssignmentKind determines the kind of assignment mechanism.
// AssignmentKind must define a Kind typedef.
template<typename DstShape, typename SrcShape> struct AssignmentKind;

// Assignment kinds defined in this file:
struct Dense2Dense {};
struct EigenBase2EigenBase {};

template<typename,typename> struct AssignmentKind { typedef EigenBase2EigenBase Kind; };
template<> struct AssignmentKind<DenseShape,DenseShape> { typedef Dense2Dense Kind; };

// This is the main assignment class
template< typename DstXprType, typename SrcXprType, typename Functor,
          typename Kind = typename AssignmentKind< typename evaluator_traits<DstXprType>::Shape , typename evaluator_traits<SrcXprType>::Shape >::Kind,
          typename EnableIf = void>
struct Assignment;


// The only purpose of this call_assignment() function is to deal with noalias() / "assume-aliasing" and automatic transposition.
// Indeed, I (Gael) think that this concept of "assume-aliasing" was a mistake, and it makes things quite complicated.
// So this intermediate function removes everything related to "assume-aliasing" such that Assignment
// does not have to bother about these annoying details.

template<typename Dst, typename Src>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void call_assignment(Dst& dst, const Src& src)
{
  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
}
template<typename Dst, typename Src>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void call_assignment(const Dst& dst, const Src& src)
{
  call_assignment(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
}

// Deal with "assume-aliasing"
template<typename Dst, typename Src, typename Func>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if< evaluator_assume_aliasing<Src>::value, void*>::type = 0)
{
  typename plain_matrix_type<Src>::type tmp(src);
  call_assignment_no_alias(dst, tmp, func);
}

template<typename Dst, typename Src, typename Func>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void call_assignment(Dst& dst, const Src& src, const Func& func, typename enable_if<!evaluator_assume_aliasing<Src>::value, void*>::type = 0)
{
  call_assignment_no_alias(dst, src, func);
}

// by-pass "assume-aliasing"
// When there is no aliasing, we require that 'dst' has been properly resized
template<typename Dst, template <typename> class StorageBase, typename Src, typename Func>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void call_assignment(NoAlias<Dst,StorageBase>& dst, const Src& src, const Func& func)
{
  call_assignment_no_alias(dst.expression(), src, func);
}
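
// Illustrative sketch (not part of the library): for a matrix product,
// evaluator_assume_aliasing<Src>::value is true, so
//
//   c = a * b;            // takes the "assume-aliasing" overload above:
//                         // the product is first evaluated into a temporary
//   c.noalias() = a * b;  // takes the NoAlias overload: the product is
//                         // evaluated directly into c, no temporary
//
// The second form is only safe when c does not alias a or b.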

template<typename Dst, typename Src, typename Func>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void call_assignment_no_alias(Dst& dst, const Src& src, const Func& func)
{
  enum {
    NeedToTranspose = (    (int(Dst::RowsAtCompileTime) == 1 && int(Src::ColsAtCompileTime) == 1)
                        || (int(Dst::ColsAtCompileTime) == 1 && int(Src::RowsAtCompileTime) == 1)
                      ) && int(Dst::SizeAtCompileTime) != 1
  };

  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst>::type ActualDstTypeCleaned;
  typedef typename internal::conditional<NeedToTranspose, Transpose<Dst>, Dst&>::type ActualDstType;
  ActualDstType actualDst(dst);

  // TODO check whether this is the right place to perform these checks:
  EIGEN_STATIC_ASSERT_LVALUE(Dst)
  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(ActualDstTypeCleaned,Src)
  EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename ActualDstTypeCleaned::Scalar,typename Src::Scalar);

  Assignment<ActualDstTypeCleaned,Src,Func>::run(actualDst, src, func);
}
template<typename Dst, typename Src>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void call_assignment_no_alias(Dst& dst, const Src& src)
{
  call_assignment_no_alias(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
}

template<typename Dst, typename Src, typename Func>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src, const Func& func)
{
  // TODO check whether this is the right place to perform these checks:
  EIGEN_STATIC_ASSERT_LVALUE(Dst)
  EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(Dst,Src)
  EIGEN_CHECK_BINARY_COMPATIBILIY(Func,typename Dst::Scalar,typename Src::Scalar);

  Assignment<Dst,Src,Func>::run(dst, src, func);
}
template<typename Dst, typename Src>
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void call_assignment_no_alias_no_transpose(Dst& dst, const Src& src)
{
  call_assignment_no_alias_no_transpose(dst, src, internal::assign_op<typename Dst::Scalar,typename Src::Scalar>());
}

// forward declaration
template<typename Dst, typename Src> void check_for_aliasing(const Dst &dst, const Src &src);

// Generic Dense to Dense assignment
// Note that the last template argument "Weak" is needed to make it possible to perform
// both partial specialization+SFINAE without ambiguous specialization
template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
struct Assignment<DstXprType, SrcXprType, Functor, Dense2Dense, Weak>
{
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const Functor &func)
  {
#ifndef EIGEN_NO_DEBUG
    internal::check_for_aliasing(dst, src);
#endif

    call_dense_assignment_loop(dst, src, func);
  }
};
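
// Illustrative sketch (not part of the library): the NeedToTranspose logic
// above is what implements the "automatic transposition" of vectors mentioned
// earlier, e.g.
//
//   Eigen::Vector3f    v;
//   Eigen::RowVector3f r(1.f, 2.f, 3.f);
//   v = r;  // Dst is 3x1, Src is 1x3: v is wrapped in Transpose<Dst>,
//           // so this assigns as if writing v.transpose() = r
//
// For genuinely mismatched shapes, EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE still
// fires at compile time.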

// Generic assignment through evalTo.
// TODO: not sure we have to keep that one, but it helps porting current code to the new evaluator mechanism.
// Note that the last template argument "Weak" is needed to make it possible to perform
// both partial specialization+SFINAE without ambiguous specialization
template< typename DstXprType, typename SrcXprType, typename Functor, typename Weak>
struct Assignment<DstXprType, SrcXprType, Functor, EigenBase2EigenBase, Weak>
{
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<typename DstXprType::Scalar,typename SrcXprType::Scalar> &/*func*/)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
      dst.resize(dstRows, dstCols);

    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    src.evalTo(dst);
  }

  // NOTE The following two functions are templated to avoid their instantiation if not needed.
  //      This is needed because some expressions support evalTo only and/or have 'void' as scalar type.
  template<typename SrcScalarType>
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::add_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
      dst.resize(dstRows, dstCols);

    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    src.addTo(dst);
  }

  template<typename SrcScalarType>
  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(DstXprType &dst, const SrcXprType &src, const internal::sub_assign_op<typename DstXprType::Scalar,SrcScalarType> &/*func*/)
  {
    Index dstRows = src.rows();
    Index dstCols = src.cols();
    if((dst.rows()!=dstRows) || (dst.cols()!=dstCols))
      dst.resize(dstRows, dstCols);

    eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols());
    src.subTo(dst);
  }
};

} // namespace internal

} // end namespace Eigen

#endif // EIGEN_ASSIGN_EVALUATOR_H