/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __IMMINTRIN_H
#error                                                                         \
    "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
#endif

#ifndef __AVXVNNIINT8INTRIN_H
#define __AVXVNNIINT8INTRIN_H

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbssd_epi32(__W, __A, __B)                                        \
  ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A),           \
                                       (__v4si)(__B)))

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbssd_epi32(__W, __A, __B)                                     \
  ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A),           \
                                       (__v8si)(__B)))

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbssds_epi32(__W, __A, __B)                                       \
  ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A),          \
                                        (__v4si)(__B)))

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
///   tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
///   tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
///   tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbssds_epi32(__W, __A, __B)                                    \
  ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A),          \
                                        (__v8si)(__B)))

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbsud_epi32(__W, __A, __B)                                        \
  ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A),           \
                                       (__v4si)(__B)))

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbsud_epi32(__W, __A, __B)                                     \
  ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A),           \
                                       (__v8si)(__B)))

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [16 x char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbsuds_epi32(__W, __A, __B)                                       \
  ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A),          \
                                        (__v4si)(__B)))

/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    signed 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with signed saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [32 x char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
///   tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
///   tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
///   tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
///   dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbsuds_epi32(__W, __A, __B)                                    \
  ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A),          \
                                        (__v8si)(__B)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    unsigned 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [16 x unsigned char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///   tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///   tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///   tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbuud_epi32(__W, __A, __B)                                        \
  ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A),           \
                                       (__v4si)(__B)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    unsigned 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [32 x unsigned char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///   tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///   tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///   tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///   dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbuud_epi32(__W, __A, __B)                                     \
  ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A),           \
                                       (__v8si)(__B)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    unsigned 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with unsigned saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __W
///    A 128-bit vector of [4 x int].
/// \param __A
///    A 128-bit vector of [16 x unsigned char].
/// \param __B
///    A 128-bit vector of [16 x unsigned char].
/// \returns
///    A 128-bit vector of [4 x int].
///
/// \code{.operation}
/// FOR j := 0 to 3
///   tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///   tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///   tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///   tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///   dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:128] := 0
/// \endcode
#define _mm_dpbuuds_epi32(__W, __A, __B)                                       \
  ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v4si)(__A),          \
                                        (__v4si)(__B)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
///    unsigned 16-bit results. Sum these 4 results with the corresponding
///    32-bit integer in \a __W with unsigned saturation, and store the packed
///    32-bit results in \a dst.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
/// \endcode
///
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
///
/// \param __W
///    A 256-bit vector of [8 x int].
/// \param __A
///    A 256-bit vector of [32 x unsigned char].
/// \param __B
///    A 256-bit vector of [32 x unsigned char].
/// \returns
///    A 256-bit vector of [8 x int].
///
/// \code{.operation}
/// FOR j := 0 to 7
///   tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
///   tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
///   tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
///   tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
///   dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// dst[MAX:256] := 0
/// \endcode
#define _mm256_dpbuuds_epi32(__W, __A, __B)                                    \
  ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v8si)(__A),          \
                                        (__v8si)(__B)))

#endif // __AVXVNNIINT8INTRIN_H