1 // This file is part of Eigen, a lightweight C++ template library 2 // for linear algebra. 3 // 4 // Copyright (C) 2008-2016 Konstantinos Margaritis <markos@freevec.org> 5 // 6 // This Source Code Form is subject to the terms of the Mozilla 7 // Public License v. 2.0. If a copy of the MPL was not distributed 8 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 9 10 #ifndef EIGEN_PACKET_MATH_ALTIVEC_H 11 #define EIGEN_PACKET_MATH_ALTIVEC_H 12 13 namespace Eigen { 14 15 namespace internal { 16 17 #ifndef EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 18 #define EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD 4 19 #endif 20 21 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD 22 #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD 23 #endif 24 25 #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD 26 #define EIGEN_HAS_SINGLE_INSTRUCTION_CJMADD 27 #endif 28 29 // NOTE Altivec has 32 registers, but Eigen only accepts a value of 8 or 16 30 #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 31 #define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 32 #endif 33 34 typedef __vector float Packet4f; 35 typedef __vector int Packet4i; 36 typedef __vector unsigned int Packet4ui; 37 typedef __vector __bool int Packet4bi; 38 typedef __vector short int Packet8i; 39 typedef __vector unsigned char Packet16uc; 40 41 // We don't want to write the same code all the time, but we need to reuse the constants 42 // and it doesn't really work to declare them global, so we define macros instead 43 44 #define _EIGEN_DECLARE_CONST_FAST_Packet4f(NAME,X) \ 45 Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(vec_splat_s32(X)) 46 47 #define _EIGEN_DECLARE_CONST_FAST_Packet4i(NAME,X) \ 48 Packet4i p4i_##NAME = vec_splat_s32(X) 49 50 #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ 51 Packet4f p4f_##NAME = pset1<Packet4f>(X) 52 53 #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ 54 Packet4i p4i_##NAME = pset1<Packet4i>(X) 55 56 #define _EIGEN_DECLARE_CONST_Packet2d(NAME,X) \ 57 Packet2d p2d_##NAME = pset1<Packet2d>(X) 58 59 #define _EIGEN_DECLARE_CONST_Packet2l(NAME,X) \ 60 Packet2l p2l_##NAME = pset1<Packet2l>(X) 61 62 #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ 63 const Packet4f p4f_##NAME = reinterpret_cast<Packet4f>(pset1<Packet4i>(X)) 64 65 #define DST_CHAN 1 66 #define DST_CTRL(size, count, stride) (((size) << 24) | ((count) << 16) | (stride)) 67 68 69 // These constants are endian-agnostic 70 static _EIGEN_DECLARE_CONST_FAST_Packet4f(ZERO, 0); //{ 0.0, 0.0, 0.0, 0.0} 71 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ZERO, 0); //{ 0, 0, 0, 0,} 72 static _EIGEN_DECLARE_CONST_FAST_Packet4i(ONE,1); //{ 1, 1, 1, 1} 73 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS16,-16); //{ -16, -16, -16, -16} 74 static _EIGEN_DECLARE_CONST_FAST_Packet4i(MINUS1,-1); //{ -1, -1, -1, -1} 75 static Packet4f p4f_MZERO = (Packet4f) vec_sl((Packet4ui)p4i_MINUS1, (Packet4ui)p4i_MINUS1); //{ 0x80000000, 0x80000000, 0x80000000, 0x80000000} 76 #ifndef __VSX__ 77 static Packet4f p4f_ONE = vec_ctf(p4i_ONE, 0); //{ 1.0, 1.0, 1.0, 1.0} 78 #endif 79 80 static Packet4f p4f_COUNTDOWN = { 0.0, 1.0, 2.0, 3.0 }; 81 static Packet4i p4i_COUNTDOWN = { 0, 1, 2, 3 }; 82 83 static Packet16uc p16uc_REVERSE32 = { 12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3 }; 84 static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; 85 86 // Mask alignment 87 #ifdef __PPC64__ 88 #define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 89 #else 90 #define _EIGEN_MASK_ALIGNMENT 0xfffffff0 91 #endif 92 93 #define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) 94 95 // Handle endianness properly while loading constants 96 // Define global static constants: 97 #ifdef _BIG_ENDIAN 98 static Packet16uc p16uc_FORWARD = vec_lvsl(0, (float*)0); 99 #ifdef __VSX__ 100 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 101 #endif 102 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; 103 static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; 104 static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; 105 #else 106 static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; 107 static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 108 static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; 109 static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; 110 static Packet16uc p16uc_HALF64_0_16 = vec_sld(vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 0), (Packet16uc)p4i_ZERO, 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; 111 #endif // _BIG_ENDIAN 112 113 static Packet16uc p16uc_PSET64_HI = (Packet16uc) vec_mergeh((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 0,1,2,3, 4,5,6,7, 0,1,2,3, 4,5,6,7 }; 114 static Packet16uc p16uc_PSET64_LO = (Packet16uc) vec_mergel((Packet4ui)p16uc_PSET32_WODD, (Packet4ui)p16uc_PSET32_WEVEN); //{ 8,9,10,11, 12,13,14,15, 8,9,10,11, 12,13,14,15 }; 115 static Packet16uc p16uc_TRANSPOSE64_HI = p16uc_PSET64_HI + p16uc_HALF64_0_16; //{ 0,1,2,3, 4,5,6,7, 16,17,18,19, 20,21,22,23}; 116 static Packet16uc p16uc_TRANSPOSE64_LO = p16uc_PSET64_LO + p16uc_HALF64_0_16; //{ 8,9,10,11, 12,13,14,15, 24,25,26,27, 28,29,30,31}; 117 118 static Packet16uc p16uc_COMPLEX32_REV = vec_sld(p16uc_REVERSE32, p16uc_REVERSE32, 8); //{ 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11 }; 119 120 #ifdef _BIG_ENDIAN 121 static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_FORWARD, p16uc_FORWARD, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 122 #else 123 static Packet16uc p16uc_COMPLEX32_REV2 = vec_sld(p16uc_PSET64_HI, p16uc_PSET64_LO, 8); //{ 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; 124 #endif // _BIG_ENDIAN 125 126 #if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC 127 #define EIGEN_PPC_PREFETCH(ADDR) __builtin_prefetch(ADDR); 128 #else 129 #define EIGEN_PPC_PREFETCH(ADDR) asm( " dcbt [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); 130 #endif 131 132 template<> struct packet_traits<float> : default_packet_traits 133 { 134 typedef Packet4f type; 135 typedef Packet4f half; 136 enum { 137 Vectorizable = 1, 138 AlignedOnScalar = 1, 139 size=4, 140 HasHalfPacket = 1, 141 142 HasAdd = 1, 143 HasSub = 1, 144 HasMul = 1, 145 HasDiv = 1, 146 HasMin = 1, 147 HasMax = 1, 148 HasAbs = 1, 149 HasSin = 0, 150 HasCos = 0, 151 HasLog = 0, 152 HasExp = 1, 153 #ifdef __VSX__ 154 HasSqrt = 1, 155 #if !EIGEN_COMP_CLANG 156 HasRsqrt = 1, 157 #else 158 HasRsqrt = 0, 159 #endif 160 #else 161 HasSqrt = 0, 162 HasRsqrt = 0, 163 #endif 164 HasRound = 1, 165 HasFloor = 1, 166 HasCeil = 1, 167 HasNegate = 1, 168 HasBlend = 1 169 }; 170 }; 171 template<> struct packet_traits<int> : default_packet_traits 172 { 173 typedef Packet4i type; 174 typedef Packet4i half; 175 enum { 176 Vectorizable = 1, 177 AlignedOnScalar = 1, 178 size = 4, 179 HasHalfPacket = 0, 180 181 HasAdd = 1, 182 HasSub = 1, 183 HasMul = 1, 184 HasDiv = 0, 185 HasBlend = 1 186 }; 187 }; 188 189 190 template<> struct unpacket_traits<Packet4f> { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; 191 template<> struct unpacket_traits<Packet4i> { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; 192 193 inline std::ostream & operator <<(std::ostream & s, const Packet16uc & v) 194 { 195 union { 196 Packet16uc v; 197 unsigned char n[16]; 198 } vt; 199 vt.v = v; 200 for (int i=0; i< 16; i++) 201 s << (int)vt.n[i] << ", "; 202 return s; 203 } 204 205 inline std::ostream & operator <<(std::ostream & s, const Packet4f & v) 206 { 207 union { 208 Packet4f v; 209 float n[4]; 210 } vt; 211 vt.v = v; 212 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 213 return s; 214 } 215 216 inline std::ostream & operator <<(std::ostream & s, const Packet4i & v) 217 { 218 union { 219 Packet4i v; 220 int n[4]; 221 } vt; 222 vt.v = v; 223 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 224 return s; 225 } 226 227 inline std::ostream & operator <<(std::ostream & s, const Packet4ui & v) 228 { 229 union { 230 Packet4ui v; 231 unsigned int n[4]; 232 } vt; 233 vt.v = v; 234 s << vt.n[0] << ", " << vt.n[1] << ", " << vt.n[2] << ", " << vt.n[3]; 235 return s; 236 } 237 238 // Need to define them first or we get specialization after instantiation errors 239 template<> EIGEN_STRONG_INLINE Packet4f pload<Packet4f>(const float* from) 240 { 241 EIGEN_DEBUG_ALIGNED_LOAD 242 #ifdef __VSX__ 243 return vec_vsx_ld(0, from); 244 #else 245 return vec_ld(0, from); 246 #endif 247 } 248 249 template<> EIGEN_STRONG_INLINE Packet4i pload<Packet4i>(const int* from) 250 { 251 EIGEN_DEBUG_ALIGNED_LOAD 252 #ifdef __VSX__ 253 return vec_vsx_ld(0, from); 254 #else 255 return vec_ld(0, from); 256 #endif 257 } 258 259 template<> EIGEN_STRONG_INLINE void pstore<float>(float* to, const Packet4f& from) 260 { 261 EIGEN_DEBUG_ALIGNED_STORE 262 #ifdef __VSX__ 263 vec_vsx_st(from, 0, to); 264 #else 265 vec_st(from, 0, to); 266 #endif 267 } 268 269 template<> EIGEN_STRONG_INLINE void pstore<int>(int* to, const Packet4i& from) 270 { 271 EIGEN_DEBUG_ALIGNED_STORE 272 #ifdef __VSX__ 273 vec_vsx_st(from, 0, to); 274 #else 275 vec_st(from, 0, to); 276 #endif 277 } 278 279 template<> EIGEN_STRONG_INLINE Packet4f pset1<Packet4f>(const float& from) { 280 Packet4f v = {from, from, from, from}; 281 return v; 282 } 283 284 template<> EIGEN_STRONG_INLINE Packet4i pset1<Packet4i>(const int& from) { 285 Packet4i v = {from, from, from, from}; 286 return v; 287 } 288 template<> EIGEN_STRONG_INLINE void 289 pbroadcast4<Packet4f>(const float *a, 290 Packet4f& a0, Packet4f& a1, Packet4f& a2, Packet4f& a3) 291 { 292 a3 = pload<Packet4f>(a); 293 a0 = vec_splat(a3, 0); 294 a1 = vec_splat(a3, 1); 295 a2 = vec_splat(a3, 2); 296 a3 = vec_splat(a3, 3); 297 } 298 template<> EIGEN_STRONG_INLINE void 299 pbroadcast4<Packet4i>(const int *a, 300 Packet4i& a0, Packet4i& a1, Packet4i& a2, Packet4i& a3) 301 { 302 a3 = pload<Packet4i>(a); 303 a0 = vec_splat(a3, 0); 304 a1 = vec_splat(a3, 1); 305 a2 = vec_splat(a3, 2); 306 a3 = vec_splat(a3, 3); 307 } 308 309 template<> EIGEN_DEVICE_FUNC inline Packet4f pgather<float, Packet4f>(const float* from, Index stride) 310 { 311 float EIGEN_ALIGN16 af[4]; 312 af[0] = from[0*stride]; 313 af[1] = from[1*stride]; 314 af[2] = from[2*stride]; 315 af[3] = from[3*stride]; 316 return pload<Packet4f>(af); 317 } 318 template<> EIGEN_DEVICE_FUNC inline Packet4i pgather<int, Packet4i>(const int* from, Index stride) 319 { 320 int EIGEN_ALIGN16 ai[4]; 321 ai[0] = from[0*stride]; 322 ai[1] = from[1*stride]; 323 ai[2] = from[2*stride]; 324 ai[3] = from[3*stride]; 325 return pload<Packet4i>(ai); 326 } 327 template<> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet4f>(float* to, const Packet4f& from, Index stride) 328 { 329 float EIGEN_ALIGN16 af[4]; 330 pstore<float>(af, from); 331 to[0*stride] = af[0]; 332 to[1*stride] = af[1]; 333 to[2*stride] = af[2]; 334 to[3*stride] = af[3]; 335 } 336 template<> EIGEN_DEVICE_FUNC inline void pscatter<int, Packet4i>(int* to, const Packet4i& from, Index stride) 337 { 338 int EIGEN_ALIGN16 ai[4]; 339 pstore<int>((int *)ai, from); 340 to[0*stride] = ai[0]; 341 to[1*stride] = ai[1]; 342 to[2*stride] = ai[2]; 343 to[3*stride] = ai[3]; 344 } 345 346 template<> EIGEN_STRONG_INLINE Packet4f plset<Packet4f>(const float& a) { return pset1<Packet4f>(a) + p4f_COUNTDOWN; } 347 template<> EIGEN_STRONG_INLINE Packet4i plset<Packet4i>(const int& a) { return pset1<Packet4i>(a) + p4i_COUNTDOWN; } 348 349 template<> EIGEN_STRONG_INLINE Packet4f padd<Packet4f>(const Packet4f& a, const Packet4f& b) { return a + b; } 350 template<> EIGEN_STRONG_INLINE Packet4i padd<Packet4i>(const Packet4i& a, const Packet4i& b) { return a + b; } 351 352 template<> EIGEN_STRONG_INLINE Packet4f psub<Packet4f>(const Packet4f& a, const Packet4f& b) { return a - b; } 353 template<> EIGEN_STRONG_INLINE Packet4i psub<Packet4i>(const Packet4i& a, const Packet4i& b) { return a - b; } 354 355 template<> EIGEN_STRONG_INLINE Packet4f pnegate(const Packet4f& a) { return p4f_ZERO - a; } 356 template<> EIGEN_STRONG_INLINE Packet4i pnegate(const Packet4i& a) { return p4i_ZERO - a; } 357 358 template<> EIGEN_STRONG_INLINE Packet4f pconj(const Packet4f& a) { return a; } 359 template<> EIGEN_STRONG_INLINE Packet4i pconj(const Packet4i& a) { return a; } 360 361 template<> EIGEN_STRONG_INLINE Packet4f pmul<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_madd(a,b, p4f_MZERO); } 362 template<> EIGEN_STRONG_INLINE Packet4i pmul<Packet4i>(const Packet4i& a, const Packet4i& b) { return a * b; } 363 364 template<> EIGEN_STRONG_INLINE Packet4f pdiv<Packet4f>(const Packet4f& a, const Packet4f& b) 365 { 366 #ifndef __VSX__ // VSX actually provides a div instruction 367 Packet4f t, y_0, y_1; 368 369 // Altivec does not offer a divide instruction, we have to do a reciprocal approximation 370 y_0 = vec_re(b); 371 372 // Do one Newton-Raphson iteration to get the needed accuracy 373 t = vec_nmsub(y_0, b, p4f_ONE); 374 y_1 = vec_madd(y_0, t, y_0); 375 376 return vec_madd(a, y_1, p4f_MZERO); 377 #else 378 return vec_div(a, b); 379 #endif 380 } 381 382 template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, const Packet4i& /*b*/) 383 { eigen_assert(false && "packet integer division are not supported by AltiVec"); 384 return pset1<Packet4i>(0); 385 } 386 387 // for some weird raisons, it has to be overloaded for packet of integers 388 template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } 389 template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } 390 391 template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } 392 template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } 393 394 template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } 395 template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } 396 397 template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } 398 template<> EIGEN_STRONG_INLINE Packet4i pand<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, b); } 399 400 template<> EIGEN_STRONG_INLINE Packet4f por<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_or(a, b); } 401 template<> EIGEN_STRONG_INLINE Packet4i por<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_or(a, b); } 402 403 template<> EIGEN_STRONG_INLINE Packet4f pxor<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_xor(a, b); } 404 template<> EIGEN_STRONG_INLINE Packet4i pxor<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_xor(a, b); } 405 406 template<> EIGEN_STRONG_INLINE Packet4f pandnot<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, vec_nor(b, b)); } 407 template<> EIGEN_STRONG_INLINE Packet4i pandnot<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_and(a, vec_nor(b, b)); } 408 409 template<> EIGEN_STRONG_INLINE Packet4f pround<Packet4f>(const Packet4f& a) { return vec_round(a); } 410 template<> EIGEN_STRONG_INLINE Packet4f pceil<Packet4f>(const Packet4f& a) { return vec_ceil(a); } 411 template<> EIGEN_STRONG_INLINE Packet4f pfloor<Packet4f>(const Packet4f& a) { return vec_floor(a); } 412 413 #ifdef _BIG_ENDIAN 414 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) 415 { 416 EIGEN_DEBUG_ALIGNED_LOAD 417 Packet16uc MSQ, LSQ; 418 Packet16uc mask; 419 MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword 420 LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword 421 mask = vec_lvsl(0, from); // create the permute mask 422 return (Packet4f) vec_perm(MSQ, LSQ, mask); // align the data 423 424 } 425 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) 426 { 427 EIGEN_DEBUG_ALIGNED_LOAD 428 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 429 Packet16uc MSQ, LSQ; 430 Packet16uc mask; 431 MSQ = vec_ld(0, (unsigned char *)from); // most significant quadword 432 LSQ = vec_ld(15, (unsigned char *)from); // least significant quadword 433 mask = vec_lvsl(0, from); // create the permute mask 434 return (Packet4i) vec_perm(MSQ, LSQ, mask); // align the data 435 } 436 #else 437 // We also need ot redefine little endian loading of Packet4i/Packet4f using VSX 438 template<> EIGEN_STRONG_INLINE Packet4i ploadu<Packet4i>(const int* from) 439 { 440 EIGEN_DEBUG_UNALIGNED_LOAD 441 return (Packet4i) vec_vsx_ld((long)from & 15, (const int*) _EIGEN_ALIGNED_PTR(from)); 442 } 443 template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) 444 { 445 EIGEN_DEBUG_UNALIGNED_LOAD 446 return (Packet4f) vec_vsx_ld((long)from & 15, (const float*) _EIGEN_ALIGNED_PTR(from)); 447 } 448 #endif 449 450 template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) 451 { 452 Packet4f p; 453 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from); 454 else p = ploadu<Packet4f>(from); 455 return vec_perm(p, p, p16uc_DUPLICATE32_HI); 456 } 457 template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) 458 { 459 Packet4i p; 460 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from); 461 else p = ploadu<Packet4i>(from); 462 return vec_perm(p, p, p16uc_DUPLICATE32_HI); 463 } 464 465 #ifdef _BIG_ENDIAN 466 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) 467 { 468 EIGEN_DEBUG_UNALIGNED_STORE 469 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 470 // Warning: not thread safe! 471 Packet16uc MSQ, LSQ, edges; 472 Packet16uc edgeAlign, align; 473 474 MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword 475 LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword 476 edgeAlign = vec_lvsl(0, to); // permute map to extract edges 477 edges=vec_perm(LSQ,MSQ,edgeAlign); // extract the edges 478 align = vec_lvsr( 0, to ); // permute map to misalign data 479 MSQ = vec_perm(edges,(Packet16uc)from,align); // misalign the data (MSQ) 480 LSQ = vec_perm((Packet16uc)from,edges,align); // misalign the data (LSQ) 481 vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first 482 vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part 483 } 484 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) 485 { 486 EIGEN_DEBUG_UNALIGNED_STORE 487 // Taken from http://developer.apple.com/hardwaredrivers/ve/alignment.html 488 // Warning: not thread safe! 489 Packet16uc MSQ, LSQ, edges; 490 Packet16uc edgeAlign, align; 491 492 MSQ = vec_ld(0, (unsigned char *)to); // most significant quadword 493 LSQ = vec_ld(15, (unsigned char *)to); // least significant quadword 494 edgeAlign = vec_lvsl(0, to); // permute map to extract edges 495 edges=vec_perm(LSQ, MSQ, edgeAlign); // extract the edges 496 align = vec_lvsr( 0, to ); // permute map to misalign data 497 MSQ = vec_perm(edges, (Packet16uc) from, align); // misalign the data (MSQ) 498 LSQ = vec_perm((Packet16uc) from, edges, align); // misalign the data (LSQ) 499 vec_st( LSQ, 15, (unsigned char *)to ); // Store the LSQ part first 500 vec_st( MSQ, 0, (unsigned char *)to ); // Store the MSQ part 501 } 502 #else 503 // We also need ot redefine little endian loading of Packet4i/Packet4f using VSX 504 template<> EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet4i& from) 505 { 506 EIGEN_DEBUG_ALIGNED_STORE 507 vec_vsx_st(from, (long)to & 15, (int*) _EIGEN_ALIGNED_PTR(to)); 508 } 509 template<> EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const Packet4f& from) 510 { 511 EIGEN_DEBUG_ALIGNED_STORE 512 vec_vsx_st(from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); 513 } 514 #endif 515 516 template<> EIGEN_STRONG_INLINE void prefetch<float>(const float* addr) { EIGEN_PPC_PREFETCH(addr); } 517 template<> EIGEN_STRONG_INLINE void prefetch<int>(const int* addr) { EIGEN_PPC_PREFETCH(addr); } 518 519 template<> EIGEN_STRONG_INLINE float pfirst<Packet4f>(const Packet4f& a) { float EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } 520 template<> EIGEN_STRONG_INLINE int pfirst<Packet4i>(const Packet4i& a) { int EIGEN_ALIGN16 x; vec_ste(a, 0, &x); return x; } 521 522 template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) 523 { 524 return reinterpret_cast<Packet4f>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); 525 } 526 template<> EIGEN_STRONG_INLINE Packet4i preverse(const Packet4i& a) 527 { 528 return reinterpret_cast<Packet4i>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE32)); } 529 530 template<> EIGEN_STRONG_INLINE Packet4f pabs(const Packet4f& a) { return vec_abs(a); } 531 template<> EIGEN_STRONG_INLINE Packet4i pabs(const Packet4i& a) { return vec_abs(a); } 532 533 template<> EIGEN_STRONG_INLINE float predux<Packet4f>(const Packet4f& a) 534 { 535 Packet4f b, sum; 536 b = vec_sld(a, a, 8); 537 sum = a + b; 538 b = vec_sld(sum, sum, 4); 539 sum += b; 540 return pfirst(sum); 541 } 542 543 template<> EIGEN_STRONG_INLINE Packet4f preduxp<Packet4f>(const Packet4f* vecs) 544 { 545 Packet4f v[4], sum[4]; 546 547 // It's easier and faster to transpose then add as columns 548 // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation 549 // Do the transpose, first set of moves 550 v[0] = vec_mergeh(vecs[0], vecs[2]); 551 v[1] = vec_mergel(vecs[0], vecs[2]); 552 v[2] = vec_mergeh(vecs[1], vecs[3]); 553 v[3] = vec_mergel(vecs[1], vecs[3]); 554 // Get the resulting vectors 555 sum[0] = vec_mergeh(v[0], v[2]); 556 sum[1] = vec_mergel(v[0], v[2]); 557 sum[2] = vec_mergeh(v[1], v[3]); 558 sum[3] = vec_mergel(v[1], v[3]); 559 560 // Now do the summation: 561 // Lines 0+1 562 sum[0] = sum[0] + sum[1]; 563 // Lines 2+3 564 sum[1] = sum[2] + sum[3]; 565 // Add the results 566 sum[0] = sum[0] + sum[1]; 567 568 return sum[0]; 569 } 570 571 template<> EIGEN_STRONG_INLINE int predux<Packet4i>(const Packet4i& a) 572 { 573 Packet4i sum; 574 sum = vec_sums(a, p4i_ZERO); 575 #ifdef _BIG_ENDIAN 576 sum = vec_sld(sum, p4i_ZERO, 12); 577 #else 578 sum = vec_sld(p4i_ZERO, sum, 4); 579 #endif 580 return pfirst(sum); 581 } 582 583 template<> EIGEN_STRONG_INLINE Packet4i preduxp<Packet4i>(const Packet4i* vecs) 584 { 585 Packet4i v[4], sum[4]; 586 587 // It's easier and faster to transpose then add as columns 588 // Check: http://www.freevec.org/function/matrix_4x4_transpose_floats for explanation 589 // Do the transpose, first set of moves 590 v[0] = vec_mergeh(vecs[0], vecs[2]); 591 v[1] = vec_mergel(vecs[0], vecs[2]); 592 v[2] = vec_mergeh(vecs[1], vecs[3]); 593 v[3] = vec_mergel(vecs[1], vecs[3]); 594 // Get the resulting vectors 595 sum[0] = vec_mergeh(v[0], v[2]); 596 sum[1] = vec_mergel(v[0], v[2]); 597 sum[2] = vec_mergeh(v[1], v[3]); 598 sum[3] = vec_mergel(v[1], v[3]); 599 600 // Now do the summation: 601 // Lines 0+1 602 sum[0] = sum[0] + sum[1]; 603 // Lines 2+3 604 sum[1] = sum[2] + sum[3]; 605 // Add the results 606 sum[0] = sum[0] + sum[1]; 607 608 return sum[0]; 609 } 610 611 // Other reduction functions: 612 // mul 613 template<> EIGEN_STRONG_INLINE float predux_mul<Packet4f>(const Packet4f& a) 614 { 615 Packet4f prod; 616 prod = pmul(a, vec_sld(a, a, 8)); 617 return pfirst(pmul(prod, vec_sld(prod, prod, 4))); 618 } 619 620 template<> EIGEN_STRONG_INLINE int predux_mul<Packet4i>(const Packet4i& a) 621 { 622 EIGEN_ALIGN16 int aux[4]; 623 pstore(aux, a); 624 return aux[0] * aux[1] * aux[2] * aux[3]; 625 } 626 627 // min 628 template<> EIGEN_STRONG_INLINE float predux_min<Packet4f>(const Packet4f& a) 629 { 630 Packet4f b, res; 631 b = vec_min(a, vec_sld(a, a, 8)); 632 res = vec_min(b, vec_sld(b, b, 4)); 633 return pfirst(res); 634 } 635 636 template<> EIGEN_STRONG_INLINE int predux_min<Packet4i>(const Packet4i& a) 637 { 638 Packet4i b, res; 639 b = vec_min(a, vec_sld(a, a, 8)); 640 res = vec_min(b, vec_sld(b, b, 4)); 641 return pfirst(res); 642 } 643 644 // max 645 template<> EIGEN_STRONG_INLINE float predux_max<Packet4f>(const Packet4f& a) 646 { 647 Packet4f b, res; 648 b = vec_max(a, vec_sld(a, a, 8)); 649 res = vec_max(b, vec_sld(b, b, 4)); 650 return pfirst(res); 651 } 652 653 template<> EIGEN_STRONG_INLINE int predux_max<Packet4i>(const Packet4i& a) 654 { 655 Packet4i b, res; 656 b = vec_max(a, vec_sld(a, a, 8)); 657 res = vec_max(b, vec_sld(b, b, 4)); 658 return pfirst(res); 659 } 660 661 template<int Offset> 662 struct palign_impl<Offset,Packet4f> 663 { 664 static EIGEN_STRONG_INLINE void run(Packet4f& first, const Packet4f& second) 665 { 666 #ifdef _BIG_ENDIAN 667 switch (Offset % 4) { 668 case 1: 669 first = vec_sld(first, second, 4); break; 670 case 2: 671 first = vec_sld(first, second, 8); break; 672 case 3: 673 first = vec_sld(first, second, 12); break; 674 } 675 #else 676 switch (Offset % 4) { 677 case 1: 678 first = vec_sld(second, first, 12); break; 679 case 2: 680 first = vec_sld(second, first, 8); break; 681 case 3: 682 first = vec_sld(second, first, 4); break; 683 } 684 #endif 685 } 686 }; 687 688 template<int Offset> 689 struct palign_impl<Offset,Packet4i> 690 { 691 static EIGEN_STRONG_INLINE void run(Packet4i& first, const Packet4i& second) 692 { 693 #ifdef _BIG_ENDIAN 694 switch (Offset % 4) { 695 case 1: 696 first = vec_sld(first, second, 4); break; 697 case 2: 698 first = vec_sld(first, second, 8); break; 699 case 3: 700 first = vec_sld(first, second, 12); break; 701 } 702 #else 703 switch (Offset % 4) { 704 case 1: 705 first = vec_sld(second, first, 12); break; 706 case 2: 707 first = vec_sld(second, first, 8); break; 708 case 3: 709 first = vec_sld(second, first, 4); break; 710 } 711 #endif 712 } 713 }; 714 715 EIGEN_DEVICE_FUNC inline void 716 ptranspose(PacketBlock<Packet4f,4>& kernel) { 717 Packet4f t0, t1, t2, t3; 718 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 719 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 720 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 721 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 722 kernel.packet[0] = vec_mergeh(t0, t2); 723 kernel.packet[1] = vec_mergel(t0, t2); 724 kernel.packet[2] = vec_mergeh(t1, t3); 725 kernel.packet[3] = vec_mergel(t1, t3); 726 } 727 728 EIGEN_DEVICE_FUNC inline void 729 ptranspose(PacketBlock<Packet4i,4>& kernel) { 730 Packet4i t0, t1, t2, t3; 731 t0 = vec_mergeh(kernel.packet[0], kernel.packet[2]); 732 t1 = vec_mergel(kernel.packet[0], kernel.packet[2]); 733 t2 = vec_mergeh(kernel.packet[1], kernel.packet[3]); 734 t3 = vec_mergel(kernel.packet[1], kernel.packet[3]); 735 kernel.packet[0] = vec_mergeh(t0, t2); 736 kernel.packet[1] = vec_mergel(t0, t2); 737 kernel.packet[2] = vec_mergeh(t1, t3); 738 kernel.packet[3] = vec_mergel(t1, t3); 739 } 740 741 template<> EIGEN_STRONG_INLINE Packet4i pblend(const Selector<4>& ifPacket, const Packet4i& thenPacket, const Packet4i& elsePacket) { 742 Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; 743 Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE))); 744 return vec_sel(elsePacket, thenPacket, mask); 745 } 746 747 template<> EIGEN_STRONG_INLINE Packet4f pblend(const Selector<4>& ifPacket, const Packet4f& thenPacket, const Packet4f& elsePacket) { 748 Packet4ui select = { ifPacket.select[0], ifPacket.select[1], ifPacket.select[2], ifPacket.select[3] }; 749 Packet4ui mask = reinterpret_cast<Packet4ui>(vec_cmpeq(reinterpret_cast<Packet4ui>(select), reinterpret_cast<Packet4ui>(p4i_ONE))); 750 return vec_sel(elsePacket, thenPacket, mask); 751 } 752 753 754 //---------- double ---------- 755 #ifdef __VSX__ 756 typedef __vector double Packet2d; 757 typedef __vector unsigned long long Packet2ul; 758 typedef __vector long long Packet2l; 759 #if EIGEN_COMP_CLANG 760 typedef Packet2ul Packet2bl; 761 #else 762 typedef __vector __bool long Packet2bl; 763 #endif 764 765 static Packet2l p2l_ONE = { 1, 1 }; 766 static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO); 767 static Packet2d p2d_ONE = { 1.0, 1.0 }; 768 static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO); 769 static Packet2d p2d_MZERO = { -0.0, -0.0 }; 770 771 #ifdef _BIG_ENDIAN 772 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ZERO), reinterpret_cast<Packet4f>(p2d_ONE), 8)); 773 #else 774 static Packet2d p2d_COUNTDOWN = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(p2d_ONE), reinterpret_cast<Packet4f>(p2d_ZERO), 8)); 775 #endif 776 777 template<int index> Packet2d vec_splat_dbl(Packet2d& a); 778 779 template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<0>(Packet2d& a) 780 { 781 return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_HI)); 782 } 783 784 template<> EIGEN_STRONG_INLINE Packet2d vec_splat_dbl<1>(Packet2d& a) 785 { 786 return reinterpret_cast<Packet2d>(vec_perm(a, a, p16uc_PSET64_LO)); 787 } 788 789 template<> struct packet_traits<double> : default_packet_traits 790 { 791 typedef Packet2d type; 792 typedef Packet2d half; 793 enum { 794 Vectorizable = 1, 795 AlignedOnScalar = 1, 796 size=2, 797 HasHalfPacket = 1, 798 799 HasAdd = 1, 800 HasSub = 1, 801 HasMul = 1, 802 HasDiv = 1, 803 HasMin = 1, 804 HasMax = 1, 805 HasAbs = 1, 806 HasSin = 0, 807 HasCos = 0, 808 HasLog = 0, 809 HasExp = 1, 810 HasSqrt = 1, 811 HasRsqrt = 1, 812 HasRound = 1, 813 HasFloor = 1, 814 HasCeil = 1, 815 HasNegate = 1, 816 HasBlend = 1 817 }; 818 }; 819 820 template<> struct unpacket_traits<Packet2d> { typedef double type; enum {size=2, alignment=Aligned16}; typedef Packet2d half; }; 821 822 inline std::ostream & operator <<(std::ostream & s, const Packet2l & v) 823 { 824 union { 825 Packet2l v; 826 int64_t n[2]; 827 } vt; 828 vt.v = v; 829 s << vt.n[0] << ", " << vt.n[1]; 830 return s; 831 } 832 833 inline std::ostream & operator <<(std::ostream & s, const Packet2d & v) 834 { 835 union { 836 Packet2d v; 837 double n[2]; 838 } vt; 839 vt.v = v; 840 s << vt.n[0] << ", " << vt.n[1]; 841 return s; 842 } 843 844 // Need to define them first or we get specialization after instantiation errors 845 template<> EIGEN_STRONG_INLINE Packet2d pload<Packet2d>(const double* from) 846 { 847 EIGEN_DEBUG_ALIGNED_LOAD 848 #ifdef __VSX__ 849 return vec_vsx_ld(0, from); 850 #else 851 return vec_ld(0, from); 852 #endif 853 } 854 855 template<> EIGEN_STRONG_INLINE void pstore<double>(double* to, const Packet2d& from) 856 { 857 EIGEN_DEBUG_ALIGNED_STORE 858 #ifdef __VSX__ 859 vec_vsx_st(from, 0, to); 860 #else 861 vec_st(from, 0, to); 862 #endif 863 } 864 865 template<> EIGEN_STRONG_INLINE Packet2d pset1<Packet2d>(const double& from) { 866 Packet2d v = {from, from}; 867 return v; 868 } 869 870 template<> EIGEN_STRONG_INLINE void 871 pbroadcast4<Packet2d>(const double *a, 872 Packet2d& a0, Packet2d& a1, Packet2d& a2, Packet2d& a3) 873 { 874 a1 = pload<Packet2d>(a); 875 a0 = vec_splat_dbl<0>(a1); 876 a1 = vec_splat_dbl<1>(a1); 877 a3 = pload<Packet2d>(a+2); 878 a2 = vec_splat_dbl<0>(a3); 879 a3 = vec_splat_dbl<1>(a3); 880 } 881 882 template<> EIGEN_DEVICE_FUNC inline Packet2d pgather<double, Packet2d>(const double* from, Index stride) 883 { 884 double EIGEN_ALIGN16 af[2]; 885 af[0] = from[0*stride]; 886 af[1] = from[1*stride]; 887 return pload<Packet2d>(af); 888 } 889 template<> EIGEN_DEVICE_FUNC inline void pscatter<double, Packet2d>(double* to, const Packet2d& from, Index stride) 890 { 891 double EIGEN_ALIGN16 af[2]; 892 pstore<double>(af, from); 893 to[0*stride] = af[0]; 894 to[1*stride] = af[1]; 895 } 896 897 template<> EIGEN_STRONG_INLINE Packet2d plset<Packet2d>(const double& a) { return pset1<Packet2d>(a) + p2d_COUNTDOWN; } 898 899 template<> EIGEN_STRONG_INLINE Packet2d padd<Packet2d>(const Packet2d& a, const Packet2d& b) { return a + b; } 900 901 template<> EIGEN_STRONG_INLINE Packet2d psub<Packet2d>(const Packet2d& a, const Packet2d& b) { return a - b; } 902 903 template<> EIGEN_STRONG_INLINE Packet2d pnegate(const Packet2d& a) { return p2d_ZERO - a; } 904 905 template<> EIGEN_STRONG_INLINE Packet2d pconj(const Packet2d& a) { return a; } 906 907 template<> EIGEN_STRONG_INLINE Packet2d pmul<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_madd(a,b,p2d_MZERO); } 908 template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_div(a,b); } 909 910 // for some weird raisons, it has to be overloaded for packet of integers 911 template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } 912 913 template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } 914 915 template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } 916 917 template<> EIGEN_STRONG_INLINE Packet2d pand<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } 918 919 template<> EIGEN_STRONG_INLINE Packet2d por<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_or(a, b); } 920 921 template<> EIGEN_STRONG_INLINE Packet2d pxor<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_xor(a, b); } 922 923 template<> EIGEN_STRONG_INLINE Packet2d pandnot<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_and(a, vec_nor(b, b)); } 924 925 template<> EIGEN_STRONG_INLINE Packet2d pround<Packet2d>(const Packet2d& a) { return vec_round(a); } 926 template<> EIGEN_STRONG_INLINE Packet2d pceil<Packet2d>(const Packet2d& a) { return vec_ceil(a); } 927 template<> EIGEN_STRONG_INLINE Packet2d pfloor<Packet2d>(const Packet2d& a) { return vec_floor(a); } 928 929 template<> EIGEN_STRONG_INLINE Packet2d ploadu<Packet2d>(const double* from) 930 { 931 EIGEN_DEBUG_ALIGNED_LOAD 932 return (Packet2d) vec_vsx_ld((long)from & 15, (const double*) _EIGEN_ALIGNED_PTR(from)); 933 } 934 935 template<> EIGEN_STRONG_INLINE Packet2d ploaddup<Packet2d>(const double* from) 936 { 937 Packet2d p; 938 if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet2d>(from); 939 else p = ploadu<Packet2d>(from); 940 return vec_splat_dbl<0>(p); 941 } 942 943 template<> EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const Packet2d& from) 944 { 945 EIGEN_DEBUG_ALIGNED_STORE 946 vec_vsx_st((Packet4f)from, (long)to & 15, (float*) _EIGEN_ALIGNED_PTR(to)); 947 } 948 949 template<> EIGEN_STRONG_INLINE void prefetch<double>(const double* addr) { EIGEN_PPC_PREFETCH(addr); } 950 951 template<> EIGEN_STRONG_INLINE double pfirst<Packet2d>(const Packet2d& a) { double EIGEN_ALIGN16 x[2]; pstore<double>(x, a); return x[0]; } 952 953 template<> EIGEN_STRONG_INLINE Packet2d preverse(const Packet2d& a) 954 { 955 return reinterpret_cast<Packet2d>(vec_perm(reinterpret_cast<Packet16uc>(a), reinterpret_cast<Packet16uc>(a), p16uc_REVERSE64)); 956 } 957 template<> EIGEN_STRONG_INLINE Packet2d pabs(const Packet2d& a) { return vec_abs(a); } 958 959 template<> EIGEN_STRONG_INLINE double predux<Packet2d>(const Packet2d& a) 960 { 961 Packet2d b, sum; 962 b = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(a), reinterpret_cast<Packet4f>(a), 8)); 963 sum = a + b; 964 return pfirst<Packet2d>(sum); 965 } 966 967 template<> EIGEN_STRONG_INLINE Packet2d preduxp<Packet2d>(const Packet2d* vecs) 968 { 969 Packet2d v[2], sum; 970 v[0] = vecs[0] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[0]), reinterpret_cast<Packet4f>(vecs[0]), 8)); 971 v[1] = vecs[1] + reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(vecs[1]), reinterpret_cast<Packet4f>(vecs[1]), 8)); 972 973 #ifdef _BIG_ENDIAN 974 sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[0]), reinterpret_cast<Packet4f>(v[1]), 8)); 975 #else 976 sum = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4f>(v[1]), reinterpret_cast<Packet4f>(v[0]), 8)); 977 #endif 978 979 return sum; 980 } 981 // Other reduction functions: 982 // mul 983 template<> EIGEN_STRONG_INLINE double predux_mul<Packet2d>(const Packet2d& a) 984 { 985 return pfirst(pmul(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); 986 } 987 988 // min 989 template<> EIGEN_STRONG_INLINE double predux_min<Packet2d>(const Packet2d& a) 990 { 991 return pfirst(pmin(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); 992 } 993 994 // max 995 template<> EIGEN_STRONG_INLINE double predux_max<Packet2d>(const Packet2d& a) 996 { 997 return pfirst(pmax(a, reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(a), reinterpret_cast<Packet4ui>(a), 8)))); 998 } 999 1000 template<int Offset> 1001 struct palign_impl<Offset,Packet2d> 1002 { 1003 static EIGEN_STRONG_INLINE void run(Packet2d& first, const Packet2d& second) 1004 { 1005 if (Offset == 1) 1006 #ifdef _BIG_ENDIAN 1007 first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(first), reinterpret_cast<Packet4ui>(second), 8)); 1008 #else 1009 first = reinterpret_cast<Packet2d>(vec_sld(reinterpret_cast<Packet4ui>(second), reinterpret_cast<Packet4ui>(first), 8)); 1010 #endif 1011 } 1012 }; 1013 1014 EIGEN_DEVICE_FUNC inline void 1015 ptranspose(PacketBlock<Packet2d,2>& kernel) { 1016 Packet2d t0, t1; 1017 t0 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_HI); 1018 t1 = vec_perm(kernel.packet[0], kernel.packet[1], p16uc_TRANSPOSE64_LO); 1019 kernel.packet[0] = t0; 1020 kernel.packet[1] = t1; 1021 } 1022 1023 template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { 1024 Packet2l select = { ifPacket.select[0], ifPacket.select[1] }; 1025 Packet2bl mask = vec_cmpeq(reinterpret_cast<Packet2d>(select), reinterpret_cast<Packet2d>(p2l_ONE)); 1026 return vec_sel(elsePacket, thenPacket, mask); 1027 } 1028 #endif // __VSX__ 1029 } // end namespace internal 1030 1031 } // end namespace Eigen 1032 1033 #endif // EIGEN_PACKET_MATH_ALTIVEC_H 1034