/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVX512ERINTRIN_H
#define __AVX512ERINTRIN_H

// Conventions shared by every macro in this header:
//
//   A, B  source vector operand(s); each is cast to the vector type the
//         builtin expects before being forwarded.
//   S     vector forwarded next to the mask in the mask_ variants
//         (presumably the pass-through value for lanes whose mask bit is
//         clear -- confirm against the Intel intrinsics documentation).
//   M     write mask.  The unmasked variants forward an all-ones mask, and
//         the maskz_ variants forward a zero vector in the position where
//         the mask_ variants forward S.
//   R     rounding-control argument, forwarded unchanged.  Variants without
//         "_round_" in their name forward _MM_FROUND_CUR_DIRECTION.
//
// The mask cast must match the builtin's vector type: __mmask8 for the
// __v8df forms and the scalar (ss/sd) forms, __mmask16 for the __v16sf forms.

// exp2a23 -- packed double precision (__v8df, 8-bit mask)
#define _mm512_exp2a23_round_pd(A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                      (__v8df)_mm512_setzero_pd(), \
                                      (__mmask8)-1, (R)); })

#define _mm512_mask_exp2a23_round_pd(S, M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                      (__v8df)(__m512d)(S), \
                                      (__mmask8)(M), (R)); })

#define _mm512_maskz_exp2a23_round_pd(M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \
                                      (__v8df)_mm512_setzero_pd(), \
                                      (__mmask8)(M), (R)); })

#define _mm512_exp2a23_pd(A) \
  _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_exp2a23_pd(S, M, A) \
  _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_exp2a23_pd(M, A) \
  _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)

// exp2a23 -- packed single precision (__v16sf).  These take a 16-bit mask,
// like the rsqrt28/rcp28 ps forms below; casting to __mmask8 would discard
// the upper eight mask bits.
#define _mm512_exp2a23_round_ps(A, R) __extension__ ({ \
  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                     (__v16sf)_mm512_setzero_ps(), \
                                     (__mmask16)-1, (R)); })

#define _mm512_mask_exp2a23_round_ps(S, M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                     (__v16sf)(__m512)(S), \
                                     (__mmask16)(M), (R)); })

#define _mm512_maskz_exp2a23_round_ps(M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \
                                     (__v16sf)_mm512_setzero_ps(), \
                                     (__mmask16)(M), (R)); })

#define _mm512_exp2a23_ps(A) \
  _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_exp2a23_ps(S, M, A) \
  _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_exp2a23_ps(M, A) \
  _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)

// rsqrt28 -- packed double precision (__v8df, 8-bit mask)
#define _mm512_rsqrt28_round_pd(A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                         (__v8df)_mm512_setzero_pd(), \
                                         (__mmask8)-1, (R)); })

#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                         (__v8df)(__m512d)(S), \
                                         (__mmask8)(M), (R)); })

#define _mm512_maskz_rsqrt28_round_pd(M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \
                                         (__v8df)_mm512_setzero_pd(), \
                                         (__mmask8)(M), (R)); })

#define _mm512_rsqrt28_pd(A) \
  _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rsqrt28_pd(S, M, A) \
  _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rsqrt28_pd(M, A) \
  _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)

// rsqrt28 -- packed single precision (__v16sf, 16-bit mask)
#define _mm512_rsqrt28_round_ps(A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                        (__v16sf)_mm512_setzero_ps(), \
                                        (__mmask16)-1, (R)); })

#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                        (__v16sf)(__m512)(S), \
                                        (__mmask16)(M), (R)); })

#define _mm512_maskz_rsqrt28_round_ps(M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \
                                        (__v16sf)_mm512_setzero_ps(), \
                                        (__mmask16)(M), (R)); })

#define _mm512_rsqrt28_ps(A) \
  _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rsqrt28_ps(S, M, A) \
  _mm512_mask_rsqrt28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rsqrt28_ps(M, A) \
  _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)

// rsqrt28 -- scalar single precision (__v4sf operands A and B, 8-bit mask)
#define _mm_rsqrt28_round_ss(A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         (__v4sf)_mm_setzero_ps(), \
                                         (__mmask8)-1, (R)); })

#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         (__v4sf)(__m128)(S), \
                                         (__mmask8)(M), (R)); })

#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rsqrt28ss_round((__v4sf)(__m128)(A), \
                                         (__v4sf)(__m128)(B), \
                                         (__v4sf)_mm_setzero_ps(), \
                                         (__mmask8)(M), (R)); })

#define _mm_rsqrt28_ss(A, B) \
  _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rsqrt28_ss(S, M, A, B) \
  _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rsqrt28_ss(M, A, B) \
  _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)

// rsqrt28 -- scalar double precision (__v2df operands A and B, 8-bit mask)
#define _mm_rsqrt28_round_sd(A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          (__v2df)_mm_setzero_pd(), \
                                          (__mmask8)-1, (R)); })

#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          (__v2df)(__m128d)(S), \
                                          (__mmask8)(M), (R)); })

#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rsqrt28sd_round((__v2df)(__m128d)(A), \
                                          (__v2df)(__m128d)(B), \
                                          (__v2df)_mm_setzero_pd(), \
                                          (__mmask8)(M), (R)); })

#define _mm_rsqrt28_sd(A, B) \
  _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rsqrt28_sd(S, M, A, B) \
  _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

// Must forward to the maskz_ round form (4 parameters); the mask_ round form
// takes an extra leading S argument and cannot be invoked with (M, A, B, R).
#define _mm_maskz_rsqrt28_sd(M, A, B) \
  _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)

// rcp28 -- packed double precision (__v8df, 8-bit mask)
#define _mm512_rcp28_round_pd(A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)_mm512_setzero_pd(), \
                                       (__mmask8)-1, (R)); })

#define _mm512_mask_rcp28_round_pd(S, M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)(__m512d)(S), \
                                       (__mmask8)(M), (R)); })

#define _mm512_maskz_rcp28_round_pd(M, A, R) __extension__ ({ \
  (__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \
                                       (__v8df)_mm512_setzero_pd(), \
                                       (__mmask8)(M), (R)); })

#define _mm512_rcp28_pd(A) \
  _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rcp28_pd(S, M, A) \
  _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rcp28_pd(M, A) \
  _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION)

// rcp28 -- packed single precision (__v16sf, 16-bit mask)
#define _mm512_rcp28_round_ps(A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)_mm512_setzero_ps(), \
                                      (__mmask16)-1, (R)); })

#define _mm512_mask_rcp28_round_ps(S, M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)(__m512)(S), \
                                      (__mmask16)(M), (R)); })

#define _mm512_maskz_rcp28_round_ps(M, A, R) __extension__ ({ \
  (__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \
                                      (__v16sf)_mm512_setzero_ps(), \
                                      (__mmask16)(M), (R)); })

#define _mm512_rcp28_ps(A) \
  _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_mask_rcp28_ps(S, M, A) \
  _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION)

#define _mm512_maskz_rcp28_ps(M, A) \
  _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION)

// rcp28 -- scalar single precision (__v4sf operands A and B, 8-bit mask)
#define _mm_rcp28_round_ss(A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rcp28ss_round((__v4sf)(__m128)(A), \
                                       (__v4sf)(__m128)(B), \
                                       (__v4sf)_mm_setzero_ps(), \
                                       (__mmask8)-1, (R)); })

#define _mm_mask_rcp28_round_ss(S, M, A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rcp28ss_round((__v4sf)(__m128)(A), \
                                       (__v4sf)(__m128)(B), \
                                       (__v4sf)(__m128)(S), \
                                       (__mmask8)(M), (R)); })

#define _mm_maskz_rcp28_round_ss(M, A, B, R) __extension__ ({ \
  (__m128)__builtin_ia32_rcp28ss_round((__v4sf)(__m128)(A), \
                                       (__v4sf)(__m128)(B), \
                                       (__v4sf)_mm_setzero_ps(), \
                                       (__mmask8)(M), (R)); })

#define _mm_rcp28_ss(A, B) \
  _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rcp28_ss(S, M, A, B) \
  _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rcp28_ss(M, A, B) \
  _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION)

// rcp28 -- scalar double precision (__v2df operands A and B, 8-bit mask)
#define _mm_rcp28_round_sd(A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rcp28sd_round((__v2df)(__m128d)(A), \
                                        (__v2df)(__m128d)(B), \
                                        (__v2df)_mm_setzero_pd(), \
                                        (__mmask8)-1, (R)); })

#define _mm_mask_rcp28_round_sd(S, M, A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rcp28sd_round((__v2df)(__m128d)(A), \
                                        (__v2df)(__m128d)(B), \
                                        (__v2df)(__m128d)(S), \
                                        (__mmask8)(M), (R)); })

#define _mm_maskz_rcp28_round_sd(M, A, B, R) __extension__ ({ \
  (__m128d)__builtin_ia32_rcp28sd_round((__v2df)(__m128d)(A), \
                                        (__v2df)(__m128d)(B), \
                                        (__v2df)_mm_setzero_pd(), \
                                        (__mmask8)(M), (R)); })

#define _mm_rcp28_sd(A, B) \
  _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_mask_rcp28_sd(S, M, A, B) \
  _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#define _mm_maskz_rcp28_sd(M, A, B) \
  _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION)

#endif // __AVX512ERINTRIN_H