/*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __IMMINTRIN_H
#error                                                                         \
    "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
#endif // __IMMINTRIN_H

#ifndef __AVXVNNIINT16INTRIN_H
#define __AVXVNNIINT16INTRIN_H

// NOTE(review): every macro below casts all three operands to __v4si (128-bit
// forms) or __v8si (256-bit forms), even where the documentation describes
// \a __A and \a __B as vectors of 16-bit elements. Presumably the builtins are
// declared with 32-bit-element operand types and the casts only reinterpret
// the bits -- confirm against the builtin definitions before changing them.

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A
/// with corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W, and store the packed 32-bit
/// results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [8 x short].
/// \param __B
///    A 128-bit vector of [8 x unsigned short].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpwsud_epi32(__W, __A, __B)                                        \
  ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A),           \
                                       (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A
/// with corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W, and store the packed 32-bit
/// results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [16 x short].
/// \param __B
///    A 256-bit vector of [16 x unsigned short].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpwsud_epi32(__W, __A, __B)                                     \
  ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A),           \
                                       (__v8si)(__B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A
/// with corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W with signed saturation, and store
/// the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [8 x short].
/// \param __B
///    A 128-bit vector of [8 x unsigned short].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpwsuds_epi32(__W, __A, __B)                                       \
  ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A),          \
                                        (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A
/// with corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W with signed saturation, and store
/// the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWSUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [16 x short].
/// \param __B
///    A 256-bit vector of [16 x unsigned short].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpwsuds_epi32(__W, __A, __B)                                    \
  ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A),          \
                                        (__v8si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding signed 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W, and store the packed 32-bit
/// results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [8 x unsigned short].
/// \param __B
///    A 128-bit vector of [8 x short].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpwusd_epi32(__W, __A, __B)                                        \
  ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A),           \
                                       (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding signed 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W, and store the packed 32-bit
/// results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [16 x unsigned short].
/// \param __B
///    A 256-bit vector of [16 x short].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpwusd_epi32(__W, __A, __B)                                     \
  ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A),           \
                                       (__v8si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding signed 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W with signed saturation, and store
/// the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [8 x unsigned short].
/// \param __B
///    A 128-bit vector of [8 x short].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpwusds_epi32(__W, __A, __B)                                       \
  ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A),          \
                                        (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding signed 16-bit integers in \a __B, producing 2
/// intermediate signed 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W with signed saturation, and store
/// the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUSDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [16 x unsigned short].
/// \param __B
///    A 256-bit vector of [16 x short].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpwusds_epi32(__W, __A, __B)                                    \
  ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A),          \
                                        (__v8si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate unsigned 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W, and store the packed 32-bit
/// results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x unsigned int].
/// \param __A
///    A 128-bit vector of [8 x unsigned short].
/// \param __B
///    A 128-bit vector of [8 x unsigned short].
/// \returns
///    A 128-bit vector of [4 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpwuud_epi32(__W, __A, __B)                                        \
  ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A),           \
                                       (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate unsigned 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W, and store the packed 32-bit
/// results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x unsigned int].
/// \param __A
///    A 256-bit vector of [16 x unsigned short].
/// \param __B
///    A 256-bit vector of [16 x unsigned short].
/// \returns
///    A 256-bit vector of [8 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpwuud_epi32(__W, __A, __B)                                     \
  ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A),           \
                                       (__v8si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate unsigned 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W with unsigned saturation, and store
/// the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x unsigned int].
/// \param __A
///    A 128-bit vector of [8 x unsigned short].
/// \param __B
///    A 128-bit vector of [8 x unsigned short].
/// \returns
///    A 128-bit vector of [4 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpwuuds_epi32(__W, __A, __B)                                       \
  ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A),          \
                                        (__v4si)(__B)))

/// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A
/// with corresponding unsigned 16-bit integers in \a __B, producing 2
/// intermediate unsigned 32-bit results. Sum these 2 results with the
/// corresponding 32-bit integer in \a __W with unsigned saturation, and store
/// the packed 32-bit results in \a dst.
///
/// \headerfile <immintrin.h>
///
/// \code
/// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPWUUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x unsigned int].
/// \param __A
///    A 256-bit vector of [16 x unsigned short].
/// \param __B
///    A 256-bit vector of [16 x unsigned short].
/// \returns
///    A 256-bit vector of [8 x unsigned int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
///   tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
///   dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpwuuds_epi32(__W, __A, __B)                                    \
  ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A),          \
                                        (__v8si)(__B)))

#endif // __AVXVNNIINT16INTRIN_H