• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 #ifndef __IMMINTRIN_H
10 #error                                                                         \
11     "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
12 #endif
13 
14 #ifndef __AVXVNNIINT8INTRIN_H
15 #define __AVXVNNIINT8INTRIN_H
16 
17 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
18 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
19 ///    signed 16-bit results. Sum these 4 results with the corresponding
20 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
21 ///
22 /// \headerfile <x86intrin.h>
23 ///
24 /// \code
25 /// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
26 /// \endcode
27 ///
28 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
29 ///
/// \param __W
///    A 128-bit vector of [4 x int] holding the 32-bit addends.
30 /// \param __A
31 ///    A 128-bit vector of [16 x char].
32 /// \param __B
33 ///    A 128-bit vector of [16 x char].
34 /// \returns
35 ///    A 128-bit vector of [4 x int].
36 ///
37 /// \code{.operation}
38 /// FOR j := 0 to 3
39 /// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
40 /// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
41 /// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
42 /// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
43 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
44 /// ENDFOR
45 /// dst[MAX:128] := 0
46 /// \endcode
47 #define _mm_dpbssd_epi32(__W, __A, __B)                                        \
48   ((__m128i)__builtin_ia32_vpdpbssd128((__v4si)(__W), (__v4si)(__A),           \
49                                        (__v4si)(__B)))
50 
51 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
52 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
53 ///    signed 16-bit results. Sum these 4 results with the corresponding
54 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
55 ///
56 /// \headerfile <x86intrin.h>
57 ///
58 /// \code
59 /// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
60 /// \endcode
61 ///
62 /// This intrinsic corresponds to the \c VPDPBSSD instruction.
63 ///
/// \param __W
///    A 256-bit vector of [8 x int] holding the 32-bit addends.
64 /// \param __A
65 ///    A 256-bit vector of [32 x char].
66 /// \param __B
67 ///    A 256-bit vector of [32 x char].
68 /// \returns
69 ///    A 256-bit vector of [8 x int].
70 ///
71 /// \code{.operation}
72 /// FOR j := 0 to 7
73 /// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
74 /// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
75 /// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
76 /// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
77 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
78 /// ENDFOR
79 /// dst[MAX:256] := 0
80 /// \endcode
81 #define _mm256_dpbssd_epi32(__W, __A, __B)                                     \
82   ((__m256i)__builtin_ia32_vpdpbssd256((__v8si)(__W), (__v8si)(__A),           \
83                                        (__v8si)(__B)))
84 
85 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
86 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
87 ///    signed 16-bit results. Sum these 4 results with the corresponding
88 ///    32-bit integer in \a __W with signed saturation, and store the packed
89 ///    32-bit results in \a dst.
90 ///
91 /// \headerfile <x86intrin.h>
92 ///
93 /// \code
94 /// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
95 /// \endcode
96 ///
97 /// This intrinsic corresponds to the \c VPDPBSSDS instruction.
98 ///
/// \param __W
///    A 128-bit vector of [4 x int] holding the 32-bit addends.
99 /// \param __A
100 ///    A 128-bit vector of [16 x char].
101 /// \param __B
102 ///    A 128-bit vector of [16 x char].
103 /// \returns
104 ///    A 128-bit vector of [4 x int].
105 ///
106 /// \code{.operation}
107 /// FOR j := 0 to 3
108 /// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
109 /// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
110 /// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
111 /// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
112 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
113 /// ENDFOR
114 /// dst[MAX:128] := 0
115 /// \endcode
116 #define _mm_dpbssds_epi32(__W, __A, __B)                                       \
117   ((__m128i)__builtin_ia32_vpdpbssds128((__v4si)(__W), (__v4si)(__A),          \
118                                         (__v4si)(__B)))
119 
120 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
121 ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
122 ///    signed 16-bit results. Sum these 4 results with the corresponding
123 ///    32-bit integer in \a __W with signed saturation, and store the packed
124 ///    32-bit results in \a dst.
125 ///
126 /// \headerfile <x86intrin.h>
127 ///
128 /// \code
129 /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
130 /// \endcode
131 ///
132 /// This intrinsic corresponds to the \c VPDPBSSDS instruction.
133 ///
/// \param __W
///    A 256-bit vector of [8 x int] holding the 32-bit addends.
134 /// \param __A
135 ///    A 256-bit vector of [32 x char].
136 /// \param __B
137 ///    A 256-bit vector of [32 x char].
138 /// \returns
139 ///    A 256-bit vector of [8 x int].
140 ///
141 /// \code{.operation}
142 /// FOR j := 0 to 7
143 /// 	tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
144 /// 	tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
145 /// 	tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
146 /// 	tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
147 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
148 /// ENDFOR
149 /// dst[MAX:256] := 0
150 /// \endcode
151 #define _mm256_dpbssds_epi32(__W, __A, __B)                                    \
152   ((__m256i)__builtin_ia32_vpdpbssds256((__v8si)(__W), (__v8si)(__A),          \
153                                         (__v8si)(__B)))
154 
155 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
156 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
157 ///    signed 16-bit results. Sum these 4 results with the corresponding
158 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
159 ///
160 /// \headerfile <x86intrin.h>
161 ///
162 /// \code
163 /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
164 /// \endcode
165 ///
166 /// This intrinsic corresponds to the \c VPDPBSUD instruction.
167 ///
/// \param __W
///    A 128-bit vector of [4 x int] holding the 32-bit addends.
168 /// \param __A
169 ///    A 128-bit vector of [16 x char].
170 /// \param __B
171 ///    A 128-bit vector of [16 x unsigned char].
172 /// \returns
173 ///    A 128-bit vector of [4 x int].
174 ///
175 /// \code{.operation}
176 /// FOR j := 0 to 3
177 /// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
178 /// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
179 /// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
180 /// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
181 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
182 /// ENDFOR
183 /// dst[MAX:128] := 0
184 /// \endcode
185 #define _mm_dpbsud_epi32(__W, __A, __B)                                        \
186   ((__m128i)__builtin_ia32_vpdpbsud128((__v4si)(__W), (__v4si)(__A),           \
187                                        (__v4si)(__B)))
188 
189 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
190 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
191 ///    signed 16-bit results. Sum these 4 results with the corresponding
192 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
193 ///
194 /// \headerfile <x86intrin.h>
195 ///
196 /// \code
197 /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
198 /// \endcode
199 ///
200 /// This intrinsic corresponds to the \c VPDPBSUD instruction.
201 ///
/// \param __W
///    A 256-bit vector of [8 x int] holding the 32-bit addends.
202 /// \param __A
203 ///    A 256-bit vector of [32 x char].
204 /// \param __B
205 ///    A 256-bit vector of [32 x unsigned char].
206 /// \returns
207 ///    A 256-bit vector of [8 x int].
208 ///
209 /// \code{.operation}
210 /// FOR j := 0 to 7
211 /// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
212 /// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
213 /// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
214 /// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
215 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
216 /// ENDFOR
217 /// dst[MAX:256] := 0
218 /// \endcode
219 #define _mm256_dpbsud_epi32(__W, __A, __B)                                     \
220   ((__m256i)__builtin_ia32_vpdpbsud256((__v8si)(__W), (__v8si)(__A),           \
221                                        (__v8si)(__B)))
222 
223 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
224 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
225 ///    signed 16-bit results. Sum these 4 results with the corresponding
226 ///    32-bit integer in \a __W with signed saturation, and store the packed
227 ///    32-bit results in \a dst.
228 ///
229 /// \headerfile <x86intrin.h>
230 ///
231 /// \code
232 /// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
233 /// \endcode
234 ///
235 /// This intrinsic corresponds to the \c VPDPBSUDS instruction.
236 ///
/// \param __W
///    A 128-bit vector of [4 x int] holding the 32-bit addends.
237 /// \param __A
238 ///    A 128-bit vector of [16 x char].
239 /// \param __B
240 ///    A 128-bit vector of [16 x unsigned char].
241 /// \returns
242 ///    A 128-bit vector of [4 x int].
243 ///
244 /// \code{.operation}
245 /// FOR j := 0 to 3
246 /// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
247 /// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
248 /// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
249 /// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
250 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
251 /// ENDFOR
252 /// dst[MAX:128] := 0
253 /// \endcode
254 #define _mm_dpbsuds_epi32(__W, __A, __B)                                       \
255   ((__m128i)__builtin_ia32_vpdpbsuds128((__v4si)(__W), (__v4si)(__A),          \
256                                         (__v4si)(__B)))
257 
258 /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
259 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
260 ///    signed 16-bit results. Sum these 4 results with the corresponding
261 ///    32-bit integer in \a __W with signed saturation, and store the packed
262 ///    32-bit results in \a dst.
263 ///
264 /// \headerfile <x86intrin.h>
265 ///
266 /// \code
267 /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
268 /// \endcode
269 ///
270 /// This intrinsic corresponds to the \c VPDPBSUDS instruction.
271 ///
/// \param __W
///    A 256-bit vector of [8 x int] holding the 32-bit addends.
272 /// \param __A
273 ///    A 256-bit vector of [32 x char].
274 /// \param __B
275 ///    A 256-bit vector of [32 x unsigned char].
276 /// \returns
277 ///    A 256-bit vector of [8 x int].
278 ///
279 /// \code{.operation}
280 /// FOR j := 0 to 7
281 /// 	tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
282 /// 	tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
283 /// 	tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
284 /// 	tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
285 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
286 /// ENDFOR
287 /// dst[MAX:256] := 0
288 /// \endcode
289 #define _mm256_dpbsuds_epi32(__W, __A, __B)                                    \
290   ((__m256i)__builtin_ia32_vpdpbsuds256((__v8si)(__W), (__v8si)(__A),          \
291                                         (__v8si)(__B)))
292 
293 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
294 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
295 ///    unsigned 16-bit results. Sum these 4 results with the corresponding
296 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
297 ///
298 /// \headerfile <x86intrin.h>
299 ///
300 /// \code
301 /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
302 /// \endcode
303 ///
304 /// This intrinsic corresponds to the \c VPDPBUUD instruction.
305 ///
/// \param __W
///    A 128-bit vector of [4 x int] holding the 32-bit addends.
306 /// \param __A
307 ///    A 128-bit vector of [16 x unsigned char].
308 /// \param __B
309 ///    A 128-bit vector of [16 x unsigned char].
310 /// \returns
311 ///    A 128-bit vector of [4 x int].
312 ///
313 /// \code{.operation}
314 /// FOR j := 0 to 3
315 /// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
316 /// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
317 /// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
318 /// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
319 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
320 /// ENDFOR
321 /// dst[MAX:128] := 0
322 /// \endcode
323 #define _mm_dpbuud_epi32(__W, __A, __B)                                        \
324   ((__m128i)__builtin_ia32_vpdpbuud128((__v4si)(__W), (__v4si)(__A),           \
325                                        (__v4si)(__B)))
326 
327 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
328 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
329 ///    unsigned 16-bit results. Sum these 4 results with the corresponding
330 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
331 ///
332 /// \headerfile <x86intrin.h>
333 ///
334 /// \code
335 /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
336 /// \endcode
337 ///
338 /// This intrinsic corresponds to the \c VPDPBUUD instruction.
339 ///
/// \param __W
///    A 256-bit vector of [8 x int] holding the 32-bit addends.
340 /// \param __A
341 ///    A 256-bit vector of [32 x unsigned char].
342 /// \param __B
343 ///    A 256-bit vector of [32 x unsigned char].
344 /// \returns
345 ///    A 256-bit vector of [8 x int].
346 ///
347 /// \code{.operation}
348 /// FOR j := 0 to 7
349 /// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
350 /// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
351 /// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
352 /// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
353 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
354 /// ENDFOR
355 /// dst[MAX:256] := 0
356 /// \endcode
357 #define _mm256_dpbuud_epi32(__W, __A, __B)                                     \
358   ((__m256i)__builtin_ia32_vpdpbuud256((__v8si)(__W), (__v8si)(__A),           \
359                                        (__v8si)(__B)))
360 
361 /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
362 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
363 ///    unsigned 16-bit results. Sum these 4 results with the corresponding
364 ///    32-bit integer in \a __W with unsigned saturation, and store the packed
365 ///    32-bit results in \a dst.
366 ///
367 /// \headerfile <x86intrin.h>
368 ///
369 /// \code
370 /// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
371 /// \endcode
372 ///
373 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
374 ///
/// \param __W
///    A 128-bit vector of [4 x int] holding the 32-bit addends.
375 /// \param __A
376 ///    A 128-bit vector of [16 x unsigned char].
377 /// \param __B
378 ///    A 128-bit vector of [16 x unsigned char].
379 /// \returns
380 ///    A 128-bit vector of [4 x int].
381 ///
382 /// \code{.operation}
383 /// FOR j := 0 to 3
384 /// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
385 /// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
386 /// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
387 /// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
388 /// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
389 /// ENDFOR
390 /// dst[MAX:128] := 0
391 /// \endcode
392 #define _mm_dpbuuds_epi32(__W, __A, __B)                                       \
393   ((__m128i)__builtin_ia32_vpdpbuuds128((__v4si)(__W), (__v4si)(__A),          \
394                                         (__v4si)(__B)))
395 
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
396 ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
397 ///    unsigned 16-bit results. Sum these 4 results with the corresponding
398 ///    32-bit integer in \a __W with unsigned saturation, and store the packed
399 ///    32-bit results in \a dst.
400 ///
401 /// \headerfile <x86intrin.h>
402 ///
403 /// \code
404 /// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
405 /// \endcode
406 ///
407 /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
408 ///
/// \param __W
///    A 256-bit vector of [8 x int] holding the 32-bit addends.
409 /// \param __A
410 ///    A 256-bit vector of [32 x unsigned char].
411 /// \param __B
412 ///    A 256-bit vector of [32 x unsigned char].
413 /// \returns
414 ///    A 256-bit vector of [8 x int].
415 ///
416 /// \code{.operation}
417 /// FOR j := 0 to 7
418 /// 	tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
419 /// 	tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
420 /// 	tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
421 /// 	tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
422 /// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
423 /// ENDFOR
424 /// dst[MAX:256] := 0
425 /// \endcode
426 #define _mm256_dpbuuds_epi32(__W, __A, __B)                                    \
427   ((__m256i)__builtin_ia32_vpdpbuuds256((__v8si)(__W), (__v8si)(__A),          \
428                                         (__v8si)(__B)))
429 
430 #endif // __AVXVNNIINT8INTRIN_H
431