/****************************************************************************
* Copyright (C) 2017 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
****************************************************************************/
#pragma once
#if 0
//===========================================================================
// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
//
// This block is compiled out (#if 0): it is a reference description of the
// interface only, not code that is built.
//===========================================================================
struct SIMD256 // or SIMD4 or SIMD16
{
    //=======================================================================
    // SIMD Types
    //
    // These typedefs are examples. The SIMD4 and SIMD16 implementations will
    // use different base types with this same naming.
    using Float   = __m256;  // Packed single-precision float vector
    using Double  = __m256d; // Packed double-precision float vector
    using Integer = __m256i; // Packed integer vector (mutable element widths)
    using Mask    = uint8_t; // Integer representing mask bits

    //=======================================================================
    // Standard interface
    // (available in both SIMD256 and SIMD16 widths)
    //=======================================================================

    //-----------------------------------------------------------------------
    // Single precision floating point arithmetic operations
    //-----------------------------------------------------------------------
    static Float add_ps(Float a, Float b);            // return a + b
    static Float div_ps(Float a, Float b);            // return a / b
    static Float fmadd_ps(Float a, Float b, Float c); // return (a * b) + c
    static Float fmsub_ps(Float a, Float b, Float c); // return (a * b) - c
    static Float max_ps(Float a, Float b);            // return (a > b) ? a : b
    static Float min_ps(Float a, Float b);            // return (a < b) ? a : b
    static Float mul_ps(Float a, Float b);            // return a * b
    static Float rcp_ps(Float a);                     // return 1.0f / a
    static Float rsqrt_ps(Float a);                   // return 1.0f / sqrt(a)
    static Float sub_ps(Float a, Float b);            // return a - b

    // Rounding control. The first group selects the rounding direction, the
    // second group selects exception behavior; the remaining enumerators are
    // the named combinations of one value from each group.
    enum class RoundMode
    {
        TO_NEAREST_INT = 0x00, // Round to nearest integer == TRUNCATE(value + (signof(value))0.5)
        TO_NEG_INF     = 0x01, // Round to negative infinity
        TO_POS_INF     = 0x02, // Round to positive infinity
        TO_ZERO        = 0x03, // Round to 0 a.k.a. truncate
        CUR_DIRECTION  = 0x04, // Round in direction set in MXCSR register

        RAISE_EXC = 0x00, // Raise exception on overflow
        NO_EXC    = 0x08, // Suppress exceptions

        NINT        = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(RAISE_EXC),
        NINT_NOEXC  = static_cast<int>(TO_NEAREST_INT) | static_cast<int>(NO_EXC),
        FLOOR       = static_cast<int>(TO_NEG_INF) | static_cast<int>(RAISE_EXC),
        FLOOR_NOEXC = static_cast<int>(TO_NEG_INF) | static_cast<int>(NO_EXC),
        CEIL        = static_cast<int>(TO_POS_INF) | static_cast<int>(RAISE_EXC),
        CEIL_NOEXC  = static_cast<int>(TO_POS_INF) | static_cast<int>(NO_EXC),
        TRUNC       = static_cast<int>(TO_ZERO) | static_cast<int>(RAISE_EXC),
        TRUNC_NOEXC = static_cast<int>(TO_ZERO) | static_cast<int>(NO_EXC),
        RINT        = static_cast<int>(CUR_DIRECTION) | static_cast<int>(RAISE_EXC),
        NEARBYINT   = static_cast<int>(CUR_DIRECTION) | static_cast<int>(NO_EXC),
    };

    // return round_func(a)
    //
    // round_func is chosen on the RMT template parameter. See the documentation
    // for the RoundMode enumeration above.
    template <RoundMode RMT>
    static Float round_ps(Float a); // return round(a)


    //-----------------------------------------------------------------------
    // Integer (various width) arithmetic operations
    //-----------------------------------------------------------------------
    static Integer abs_epi32(Integer a);            // return absolute_value(a) (int32)
    static Integer add_epi32(Integer a, Integer b); // return a + b (int32)
    static Integer add_epi8(Integer a, Integer b);  // return a + b (int8)
    static Integer adds_epu8(Integer a, Integer b); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
    static Integer max_epi32(Integer a, Integer b); // return (a > b) ? a : b (int32)
    static Integer max_epu32(Integer a, Integer b); // return (a > b) ? a : b (uint32)
    static Integer min_epi32(Integer a, Integer b); // return (a < b) ? a : b (int32)
    static Integer min_epu32(Integer a, Integer b); // return (a < b) ? a : b (uint32)
    static Integer mul_epi32(Integer a, Integer b); // return a * b (int32)

    // return (a * b) & 0xFFFFFFFF
    //
    // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
    // and store the low 32 bits of the intermediate integers in dst.
    static Integer mullo_epi32(Integer a, Integer b);

    static Integer sub_epi32(Integer a, Integer b); // return a - b (int32)
    static Integer sub_epi64(Integer a, Integer b); // return a - b (int64)
    static Integer subs_epu8(Integer a, Integer b); // return (b > a) ? 0 : (a - b) (uint8)

    //-----------------------------------------------------------------------
    // Logical operations
    //-----------------------------------------------------------------------
    static Float   and_ps(Float a, Float b);        // return a & b (float treated as int)
    static Integer and_si(Integer a, Integer b);    // return a & b (int)
    static Float   andnot_ps(Float a, Float b);     // return (~a) & b (float treated as int)
    static Integer andnot_si(Integer a, Integer b); // return (~a) & b (int)
    static Float   or_ps(Float a, Float b);         // return a | b (float treated as int)
    static Integer or_si(Integer a, Integer b);     // return a | b (int)
    static Float   xor_ps(Float a, Float b);        // return a ^ b (float treated as int)
    static Integer xor_si(Integer a, Integer b);    // return a ^ b (int)

    //-----------------------------------------------------------------------
    // Shift operations
    //-----------------------------------------------------------------------
    template<int ImmT>
    static Integer slli_epi32(Integer a);            // return a << ImmT
    static Integer sllv_epi32(Integer a, Integer b); // return a << b
    template<int ImmT>
    static Integer srai_epi32(Integer a);            // return a >> ImmT (int32)
    template<int ImmT>
    static Integer srli_epi32(Integer a);            // return a >> ImmT (uint32)
    template<int ImmT>                               // for each 128-bit lane:
    static Integer srli_si(Integer a);               //     return a >> (ImmT*8) (uint)
    template<int ImmT>
    static Float srlisi_ps(Float a);                 // same as srli_si, but with Float cast to int
    static Integer srlv_epi32(Integer a, Integer b); // return a >> b (uint32)

    //-----------------------------------------------------------------------
    // Conversion operations
    //-----------------------------------------------------------------------
    static Float   castpd_ps(Double a);       // return *(Float*)(&a)
    static Integer castps_si(Float a);        // return *(Integer*)(&a)
    static Double  castsi_pd(Integer a);      // return *(Double*)(&a)
    static Double  castps_pd(Float a);        // return *(Double*)(&a)
    static Float   castsi_ps(Integer a);      // return *(Float*)(&a)
    static Float   cvtepi32_ps(Integer a);    // return (float)a (int32 --> float)
    static Integer cvtepu8_epi16(Integer a);  // return (int16)a (uint8 --> int16)
    static Integer cvtepu8_epi32(Integer a);  // return (int32)a (uint8 --> int32)
    static Integer cvtepu16_epi32(Integer a); // return (int32)a (uint16 --> int32)
    static Integer cvtepu16_epi64(Integer a); // return (int64)a (uint16 --> int64)
    static Integer cvtepu32_epi64(Integer a); // return (int64)a (uint32 --> int64)
    static Integer cvtps_epi32(Float a);      // return (int32)a (float --> int32)
    static Integer cvttps_epi32(Float a);     // return (int32)a (rnd_to_zero(float) --> int32)

    //-----------------------------------------------------------------------
    // Comparison operations
    //-----------------------------------------------------------------------

    // Comparison types used with cmp_ps:
    // - ordered comparisons are always false if either operand is NaN
    // - unordered comparisons are always true if either operand is NaN
    // - signaling comparisons raise an exception if either operand is NaN
    // - non-signaling comparisons will never raise an exception
    //
    // Ordered:   return (a != NaN) && (b != NaN) && (a cmp b)
    // Unordered: return (a == NaN) || (b == NaN) || (a cmp b)
    enum class CompareType
    {
        EQ_OQ    = 0x00, // Equal (ordered, nonsignaling)
        LT_OS    = 0x01, // Less-than (ordered, signaling)
        LE_OS    = 0x02, // Less-than-or-equal (ordered, signaling)
        UNORD_Q  = 0x03, // Unordered (nonsignaling)
        NEQ_UQ   = 0x04, // Not-equal (unordered, nonsignaling)
        NLT_US   = 0x05, // Not-less-than (unordered, signaling)
        NLE_US   = 0x06, // Not-less-than-or-equal (unordered, signaling)
        ORD_Q    = 0x07, // Ordered (nonsignaling)
        EQ_UQ    = 0x08, // Equal (unordered, non-signaling)
        NGE_US   = 0x09, // Not-greater-than-or-equal (unordered, signaling)
        NGT_US   = 0x0A, // Not-greater-than (unordered, signaling)
        FALSE_OQ = 0x0B, // False (ordered, nonsignaling)
        NEQ_OQ   = 0x0C, // Not-equal (ordered, non-signaling)
        GE_OS    = 0x0D, // Greater-than-or-equal (ordered, signaling)
        GT_OS    = 0x0E, // Greater-than (ordered, signaling)
        TRUE_UQ  = 0x0F, // True (unordered, non-signaling)
        EQ_OS    = 0x10, // Equal (ordered, signaling)
        LT_OQ    = 0x11, // Less-than (ordered, nonsignaling)
        LE_OQ    = 0x12, // Less-than-or-equal (ordered, nonsignaling)
        UNORD_S  = 0x13, // Unordered (signaling)
        NEQ_US   = 0x14, // Not-equal (unordered, signaling)
        NLT_UQ   = 0x15, // Not-less-than (unordered, nonsignaling)
        NLE_UQ   = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
        ORD_S    = 0x17, // Ordered (signaling)
        EQ_US    = 0x18, // Equal (unordered, signaling)
        NGE_UQ   = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
        NGT_UQ   = 0x1A, // Not-greater-than (unordered, nonsignaling)
        FALSE_OS = 0x1B, // False (ordered, signaling)
        NEQ_OS   = 0x1C, // Not-equal (ordered, signaling)
        GE_OQ    = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
        GT_OQ    = 0x1E, // Greater-than (ordered, nonsignaling)
        TRUE_US  = 0x1F, // True (unordered, signaling)
    };

    // return a (CmpTypeT) b (float)
    //
    // See documentation for CompareType above for valid values for CmpTypeT.
    template<CompareType CmpTypeT>
    static Float cmp_ps(Float a, Float b);            // return a (CmpTypeT) b (see above)
    static Float cmpgt_ps(Float a, Float b);          // return cmp_ps<CompareType::GT_OQ>(a, b)
    static Float cmple_ps(Float a, Float b);          // return cmp_ps<CompareType::LE_OQ>(a, b)
    static Float cmplt_ps(Float a, Float b);          // return cmp_ps<CompareType::LT_OQ>(a, b)
    static Float cmpneq_ps(Float a, Float b);         // return cmp_ps<CompareType::NEQ_OQ>(a, b)
    static Float cmpeq_ps(Float a, Float b);          // return cmp_ps<CompareType::EQ_OQ>(a, b)
    static Float cmpge_ps(Float a, Float b);          // return cmp_ps<CompareType::GE_OQ>(a, b)
    static Integer cmpeq_epi8(Integer a, Integer b);  // return a == b (int8)
    static Integer cmpeq_epi16(Integer a, Integer b); // return a == b (int16)
    static Integer cmpeq_epi32(Integer a, Integer b); // return a == b (int32)
    static Integer cmpeq_epi64(Integer a, Integer b); // return a == b (int64)
    static Integer cmpgt_epi8(Integer a, Integer b);  // return a > b (int8)
    static Integer cmpgt_epi16(Integer a, Integer b); // return a > b (int16)
    static Integer cmpgt_epi32(Integer a, Integer b); // return a > b (int32)
    static Integer cmpgt_epi64(Integer a, Integer b); // return a > b (int64)
    static Integer cmplt_epi32(Integer a, Integer b); // return a < b (int32)
    static bool testz_ps(Float a, Float b);           // return all_lanes_zero(a & b) ? 1 : 0 (float)
    static bool testz_si(Integer a, Integer b);       // return all_lanes_zero(a & b) ? 1 : 0 (int)

    //-----------------------------------------------------------------------
    // Blend / shuffle / permute operations
    //
    // NOTE(review): the shuffle_* and unpack* entries below carry no
    // description here; presumably they mirror the same-named _mm256_* /
    // _mm512_* intrinsics, as the packs_* entries do -- confirm against the
    // implementation.
    //-----------------------------------------------------------------------
    template<int ImmT>
    static Float blend_ps(Float a, Float b);                       // return ImmT ? b : a (float)
    static Integer blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int)
    static Float blendv_ps(Float a, Float b, Float mask);          // return mask ? b : a (float)
    static Float broadcast_ss(float const *p);                     // return *p (all elements in vector get same value)
    static Integer packs_epi16(Integer a, Integer b);              // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
    static Integer packs_epi32(Integer a, Integer b);              // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
    static Integer packus_epi16(Integer a, Integer b);             // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
    static Integer packus_epi32(Integer a, Integer b);             // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
    static Integer permute_epi32(Integer a, Integer swiz);         // return a[swiz[i]] for each 32-bit lane i (int32)
    static Float permute_ps(Float a, Integer swiz);                // return a[swiz[i]] for each 32-bit lane i (float)
    template<int SwizT>
    static Integer shuffle_epi32(Integer a, Integer b);
    template<int SwizT>
    static Integer shuffle_epi64(Integer a, Integer b);
    static Integer shuffle_epi8(Integer a, Integer b);
    template<int SwizT>
    static Double shuffle_pd(Double a, Double b);
    template<int SwizT>
    static Float shuffle_ps(Float a, Float b);
    static Integer unpackhi_epi16(Integer a, Integer b);
    static Integer unpackhi_epi32(Integer a, Integer b);
    static Integer unpackhi_epi64(Integer a, Integer b);
    static Integer unpackhi_epi8(Integer a, Integer b);
    static Double unpackhi_pd(Double a, Double b);
    static Float unpackhi_ps(Float a, Float b);
    static Integer unpacklo_epi16(Integer a, Integer b);
    static Integer unpacklo_epi32(Integer a, Integer b);
    static Integer unpacklo_epi64(Integer a, Integer b);
    static Integer unpacklo_epi8(Integer a, Integer b);
    static Double unpacklo_pd(Double a, Double b);
    static Float unpacklo_ps(Float a, Float b);

    //-----------------------------------------------------------------------
    // Load / store operations
    //-----------------------------------------------------------------------

    // Multiplier applied to gather indices. Each enumerator's value is the
    // multiplier itself, so "idx * ScaleT" in the gather descriptions below
    // reads literally.
    enum class ScaleFactor
    {
        SF_1 = 1, // No scaling
        SF_2 = 2, // Scale offset by 2
        SF_4 = 4, // Scale offset by 4
        SF_8 = 8, // Scale offset by 8
    };

    template<ScaleFactor ScaleT = ScaleFactor::SF_1>
    static Float i32gather_ps(float const* p, Integer idx); // return *(float*)(((int8*)p) + (idx * ScaleT))
    static Float load1_ps(float const *p);                  // return *p (broadcast 1 value to all elements)
    static Float load_ps(float const *p);                   // return *p (loads SIMD width elements from memory)
    static Integer load_si(Integer const *p);               // return *p
    static Float loadu_ps(float const *p);                  // return *p (same as load_ps but allows for unaligned mem)
    static Integer loadu_si(Integer const *p);              // return *p (same as load_si but allows for unaligned mem)

    // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
    template<ScaleFactor ScaleT = ScaleFactor::SF_1>
    static Float mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);

    static void maskstore_ps(float *p, Integer mask, Float src);
    static int movemask_epi8(Integer a);
    static int movemask_pd(Double a);
    static int movemask_ps(Float a);
    static Integer set1_epi32(int i);            // return i (all elements are same value)
    static Integer set1_epi8(char i);            // return i (all elements are same value)
    static Float set1_ps(float f);               // return f (all elements are same value)
    static Float setzero_ps();                   // return 0 (float)
    static Integer setzero_si();                 // return 0 (integer)
    static void store_ps(float *p, Float a);     // *p = a (stores all elements contiguously in memory)
    static void store_si(Integer *p, Integer a); // *p = a
    static void stream_ps(float *p, Float a);    // *p = a (same as store_ps, but doesn't keep memory in cache)

    //=======================================================================
    // Legacy interface (available only in SIMD256 width)
    //=======================================================================

    static Float broadcast_ps(__m128 const *p);
    template<int ImmT>
    static __m128d extractf128_pd(Double a);
    template<int ImmT>
    static __m128 extractf128_ps(Float a);
    template<int ImmT>
    static __m128i extractf128_si(Integer a);
    template<int ImmT>
    static Double insertf128_pd(Double a, __m128d b);
    template<int ImmT>
    static Float insertf128_ps(Float a, __m128 b);
    template<int ImmT>
    static Integer insertf128_si(Integer a, __m128i b);
    // Integer load counterpart of storeu2_si: both take 128-bit integer halves.
    static Integer loadu2_si(__m128i const* phi, __m128i const* plo);
    template<int ImmT>
    static Double permute2f128_pd(Double a, Double b);
    template<int ImmT>
    static Float permute2f128_ps(Float a, Float b);
    template<int ImmT>
    static Integer permute2f128_si(Integer a, Integer b);
    static Integer set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
    static void storeu2_si(__m128i *phi, __m128i *plo, Integer src);

    //=======================================================================
    // Advanced masking interface (currently available only in SIMD16 width)
    //=======================================================================
};
#endif // #if 0