1#include <clc/clc.h> 2 3//For all types EXCEPT long, which is implemented separately 4#define __CLC_MUL_HI_IMPL(BGENTYPE, GENTYPE, GENSIZE) \ 5 _CLC_OVERLOAD _CLC_DEF GENTYPE mul_hi(GENTYPE x, GENTYPE y){ \ 6 return (GENTYPE)(((BGENTYPE)x * (BGENTYPE)y) >> GENSIZE); \ 7 } \ 8 9//FOIL-based long mul_hi 10// 11// Summary: Treat mul_hi(long x, long y) as: 12// (a+b) * (c+d) where a and c are the high-order parts of x and y respectively 13// and b and d are the low-order parts of x and y. 14// Thinking back to algebra, we use FOIL to do the work. 15 16_CLC_OVERLOAD _CLC_DEF long mul_hi(long x, long y){ 17 long f, o, i; 18 ulong l; 19 20 //Move the high/low halves of x/y into the lower 32-bits of variables so 21 //that we can multiply them without worrying about overflow. 22 long x_hi = x >> 32; 23 long x_lo = x & UINT_MAX; 24 long y_hi = y >> 32; 25 long y_lo = y & UINT_MAX; 26 27 //Multiply all of the components according to FOIL method 28 f = x_hi * y_hi; 29 o = x_hi * y_lo; 30 i = x_lo * y_hi; 31 l = x_lo * y_lo; 32 33 //Now add the components back together in the following steps: 34 //F: doesn't need to be modified 35 //O/I: Need to be added together. 36 //L: Shift right by 32-bits, then add into the sum of O and I 37 //Once O/I/L are summed up, then shift the sum by 32-bits and add to F. 38 // 39 //We use hadd to give us a bit of extra precision for the intermediate sums 40 //but as a result, we shift by 31 bits instead of 32 41 return (long)(f + (hadd(o, (i + (long)((ulong)l>>32))) >> 31)); 42} 43 44_CLC_OVERLOAD _CLC_DEF ulong mul_hi(ulong x, ulong y){ 45 ulong f, o, i; 46 ulong l; 47 48 //Move the high/low halves of x/y into the lower 32-bits of variables so 49 //that we can multiply them without worrying about overflow. 50 ulong x_hi = x >> 32; 51 ulong x_lo = x & UINT_MAX; 52 ulong y_hi = y >> 32; 53 ulong y_lo = y & UINT_MAX; 54 55 //Multiply all of the components according to FOIL method 56 f = x_hi * y_hi; 57 o = x_hi * y_lo; 58 i = x_lo * y_hi; 59 l = x_lo * y_lo; 60 61 //Now add the components back together, taking care to respect the fact that: 62 //F: doesn't need to be modified 63 //O/I: Need to be added together. 64 //L: Shift right by 32-bits, then add into the sum of O and I 65 //Once O/I/L are summed up, then shift the sum by 32-bits and add to F. 66 // 67 //We use hadd to give us a bit of extra precision for the intermediate sums 68 //but as a result, we shift by 31 bits instead of 32 69 return (f + (hadd(o, (i + (l>>32))) >> 31)); 70} 71 72#define __CLC_MUL_HI_VEC(GENTYPE) \ 73 _CLC_OVERLOAD _CLC_DEF GENTYPE##2 mul_hi(GENTYPE##2 x, GENTYPE##2 y){ \ 74 return (GENTYPE##2){mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1)}; \ 75 } \ 76 _CLC_OVERLOAD _CLC_DEF GENTYPE##3 mul_hi(GENTYPE##3 x, GENTYPE##3 y){ \ 77 return (GENTYPE##3){mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1), mul_hi(x.s2, y.s2)}; \ 78 } \ 79 _CLC_OVERLOAD _CLC_DEF GENTYPE##4 mul_hi(GENTYPE##4 x, GENTYPE##4 y){ \ 80 return (GENTYPE##4){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \ 81 } \ 82 _CLC_OVERLOAD _CLC_DEF GENTYPE##8 mul_hi(GENTYPE##8 x, GENTYPE##8 y){ \ 83 return (GENTYPE##8){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \ 84 } \ 85 _CLC_OVERLOAD _CLC_DEF GENTYPE##16 mul_hi(GENTYPE##16 x, GENTYPE##16 y){ \ 86 return (GENTYPE##16){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \ 87 } \ 88 89#define __CLC_MUL_HI_DEC_IMPL(BTYPE, TYPE, BITS) \ 90 __CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \ 91 __CLC_MUL_HI_VEC(TYPE) 92 93#define __CLC_MUL_HI_TYPES() \ 94 __CLC_MUL_HI_DEC_IMPL(short, char, 8) \ 95 __CLC_MUL_HI_DEC_IMPL(ushort, uchar, 8) \ 96 __CLC_MUL_HI_DEC_IMPL(int, short, 16) \ 97 __CLC_MUL_HI_DEC_IMPL(uint, ushort, 16) \ 98 __CLC_MUL_HI_DEC_IMPL(long, int, 32) \ 99 __CLC_MUL_HI_DEC_IMPL(ulong, uint, 32) \ 100 __CLC_MUL_HI_VEC(long) \ 101 __CLC_MUL_HI_VEC(ulong) 102 103__CLC_MUL_HI_TYPES() 104 105#undef __CLC_MUL_HI_TYPES 106#undef __CLC_MUL_HI_DEC_IMPL 107#undef __CLC_MUL_HI_IMPL 108#undef __CLC_MUL_HI_VEC 109#undef __CLC_B32 110