• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*===----------- avxvnniint16intrin.h - AVXVNNIINT16 intrinsics-------------===
2  *
3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  * See https://llvm.org/LICENSE.txt for license information.
5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  *
7  *===-----------------------------------------------------------------------===
8  */
9 
10 #ifndef __IMMINTRIN_H
11 #error                                                                         \
12     "Never use <avxvnniint16intrin.h> directly; include <immintrin.h> instead."
13 #endif // __IMMINTRIN_H
14 
15 #ifndef __AVXVNNIINT16INTRIN_H
16 #define __AVXVNNIINT16INTRIN_H
17 
18 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
19 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
20 ///    signed 32-bit results. Sum these 2 results with the corresponding
21 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
22 ///
23 /// \headerfile <immintrin.h>
24 ///
25 /// \code
26 /// __m128i _mm_dpwsud_epi32(__m128i __W, __m128i __A, __m128i __B)
27 /// \endcode
28 ///
29 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
30 ///
31 /// \param __W
32 ///    A 128-bit vector of [4 x int].
33 /// \param __A
34 ///    A 128-bit vector of [8 x short].
35 /// \param __B
36 ///    A 128-bit vector of [8 x unsigned short].
37 /// \returns
38 ///    A 128-bit vector of [4 x int].
39 ///
40 /// \code{.operation}
41 /// FOR j := 0 to 3
42 /// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
43 /// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
44 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
45 /// ENDFOR
46 /// dst[MAX:128] := 0
47 /// \endcode
48 #define _mm_dpwsud_epi32(__W, __A, __B)                                        \
49   ((__m128i)__builtin_ia32_vpdpwsud128((__v4si)(__W), (__v4si)(__A),           \
50                                        (__v4si)(__B)))
51 
52 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
53 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
54 ///    signed 32-bit results. Sum these 2 results with the corresponding
55 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
56 ///
57 /// \headerfile <immintrin.h>
58 ///
59 /// \code
60 /// __m256i _mm256_dpwsud_epi32(__m256i __W, __m256i __A, __m256i __B)
61 /// \endcode
62 ///
63 /// This intrinsic corresponds to the \c VPDPWSUD instruction.
64 ///
65 /// \param __W
66 ///    A 256-bit vector of [8 x int].
67 /// \param __A
68 ///    A 256-bit vector of [16 x short].
69 /// \param __B
70 ///    A 256-bit vector of [16 x unsigned short].
71 /// \returns
72 ///    A 256-bit vector of [8 x int].
73 ///
74 /// \code{.operation}
75 /// FOR j := 0 to 7
76 /// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
77 /// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
78 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
79 /// ENDFOR
80 /// dst[MAX:256] := 0
81 /// \endcode
82 #define _mm256_dpwsud_epi32(__W, __A, __B)                                     \
83   ((__m256i)__builtin_ia32_vpdpwsud256((__v8si)(__W), (__v8si)(__A),           \
84                                        (__v8si)(__B)))
85 
86 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
87 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
88 ///    signed 32-bit results. Sum these 2 results with the corresponding
89 ///    32-bit integer in \a __W with signed saturation, and store the packed
90 ///    32-bit results in \a dst.
91 ///
92 /// \headerfile <immintrin.h>
93 ///
94 /// \code
95 /// __m128i _mm_dpwsuds_epi32(__m128i __W, __m128i __A, __m128i __B)
96 /// \endcode
97 ///
98 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
99 ///
100 /// \param __W
101 ///    A 128-bit vector of [4 x int].
102 /// \param __A
103 ///    A 128-bit vector of [8 x short].
104 /// \param __B
105 ///    A 128-bit vector of [8 x unsigned short].
106 /// \returns
107 ///    A 128-bit vector of [4 x int].
108 ///
109 /// \code{.operation}
110 /// FOR j := 0 to 3
111 /// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
112 /// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
113 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
114 /// ENDFOR
115 /// dst[MAX:128] := 0
116 /// \endcode
117 #define _mm_dpwsuds_epi32(__W, __A, __B)                                       \
118   ((__m128i)__builtin_ia32_vpdpwsuds128((__v4si)(__W), (__v4si)(__A),          \
119                                         (__v4si)(__B)))
120 
121 /// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
122 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
123 ///    signed 32-bit results. Sum these 2 results with the corresponding
124 ///    32-bit integer in \a __W with signed saturation, and store the packed
125 ///    32-bit results in \a dst.
126 ///
127 /// \headerfile <immintrin.h>
128 ///
129 /// \code
130 /// __m256i _mm256_dpwsuds_epi32(__m256i __W, __m256i __A, __m256i __B)
131 /// \endcode
132 ///
133 /// This intrinsic corresponds to the \c VPDPWSUDS instruction.
134 ///
135 /// \param __W
136 ///    A 256-bit vector of [8 x int].
137 /// \param __A
138 ///    A 256-bit vector of [16 x short].
139 /// \param __B
140 ///    A 256-bit vector of [16 x unsigned short].
141 /// \returns
142 ///    A 256-bit vector of [8 x int].
143 ///
144 /// \code{.operation}
145 /// FOR j := 0 to 7
146 /// 	tmp1.dword := SignExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
147 /// 	tmp2.dword := SignExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
148 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
149 /// ENDFOR
150 /// dst[MAX:256] := 0
151 /// \endcode
152 #define _mm256_dpwsuds_epi32(__W, __A, __B)                                    \
153   ((__m256i)__builtin_ia32_vpdpwsuds256((__v8si)(__W), (__v8si)(__A),          \
154                                         (__v8si)(__B)))
155 
156 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
157 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
158 ///    signed 32-bit results. Sum these 2 results with the corresponding
159 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
160 ///
161 /// \headerfile <immintrin.h>
162 ///
163 /// \code
164 /// __m128i _mm_dpwusd_epi32(__m128i __W, __m128i __A, __m128i __B)
165 /// \endcode
166 ///
167 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
168 ///
169 /// \param __W
170 ///    A 128-bit vector of [4 x int].
171 /// \param __A
172 ///    A 128-bit vector of [8 x unsigned short].
173 /// \param __B
174 ///    A 128-bit vector of [8 x short].
175 /// \returns
176 ///    A 128-bit vector of [4 x int].
177 ///
178 /// \code{.operation}
179 /// FOR j := 0 to 3
180 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
181 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
182 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
183 /// ENDFOR
184 /// dst[MAX:128] := 0
185 /// \endcode
186 #define _mm_dpwusd_epi32(__W, __A, __B)                                        \
187   ((__m128i)__builtin_ia32_vpdpwusd128((__v4si)(__W), (__v4si)(__A),           \
188                                        (__v4si)(__B)))
189 
190 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
191 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
192 ///    signed 32-bit results. Sum these 2 results with the corresponding
193 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
194 ///
195 /// \headerfile <immintrin.h>
196 ///
197 /// \code
198 /// __m256i _mm256_dpwusd_epi32(__m256i __W, __m256i __A, __m256i __B)
199 /// \endcode
200 ///
201 /// This intrinsic corresponds to the \c VPDPWUSD instruction.
202 ///
203 /// \param __W
204 ///    A 256-bit vector of [8 x int].
205 /// \param __A
206 ///    A 256-bit vector of [16 x unsigned short].
207 /// \param __B
208 ///    A 256-bit vector of [16 x short].
209 /// \returns
210 ///    A 256-bit vector of [8 x int].
211 ///
212 /// \code{.operation}
213 /// FOR j := 0 to 7
214 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
215 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
216 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
217 /// ENDFOR
218 /// dst[MAX:256] := 0
219 /// \endcode
220 #define _mm256_dpwusd_epi32(__W, __A, __B)                                     \
221   ((__m256i)__builtin_ia32_vpdpwusd256((__v8si)(__W), (__v8si)(__A),           \
222                                        (__v8si)(__B)))
223 
224 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
225 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
226 ///    signed 32-bit results. Sum these 2 results with the corresponding
227 ///    32-bit integer in \a __W with signed saturation, and store the packed
228 ///    32-bit results in \a dst.
229 ///
230 /// \headerfile <immintrin.h>
231 ///
232 /// \code
233 /// __m128i _mm_dpwusds_epi32(__m128i __W, __m128i __A, __m128i __B)
234 /// \endcode
235 ///
236 /// This intrinsic corresponds to the \c VPDPWUSDS instruction.
237 ///
238 /// \param __W
239 ///    A 128-bit vector of [4 x int].
240 /// \param __A
241 ///    A 128-bit vector of [8 x unsigned short].
242 /// \param __B
243 ///    A 128-bit vector of [8 x short].
244 /// \returns
245 ///    A 128-bit vector of [4 x int].
246 ///
247 /// \code{.operation}
248 /// FOR j := 0 to 3
249 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
250 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
251 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
252 /// ENDFOR
253 /// dst[MAX:128] := 0
254 /// \endcode
255 #define _mm_dpwusds_epi32(__W, __A, __B)                                       \
256   ((__m128i)__builtin_ia32_vpdpwusds128((__v4si)(__W), (__v4si)(__A),          \
257                                         (__v4si)(__B)))
258 
259 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
260 ///    corresponding signed 16-bit integers in \a __B, producing 2 intermediate
261 ///    signed 32-bit results. Sum these 2 results with the corresponding
262 ///    32-bit integer in \a __W with signed saturation, and store the packed
263 ///    32-bit results in \a dst.
264 ///
265 /// \headerfile <immintrin.h>
266 ///
267 /// \code
268 /// __m256i _mm256_dpwusds_epi32(__m256i __W, __m256i __A, __m256i __B)
269 /// \endcode
270 ///
271 /// This intrinsic corresponds to the \c VPDPWUSDS instruction.
272 ///
273 /// \param __W
274 ///    A 256-bit vector of [8 x int].
275 /// \param __A
276 ///    A 256-bit vector of [16 x unsigned short].
277 /// \param __B
278 ///    A 256-bit vector of [16 x short].
279 /// \returns
280 ///    A 256-bit vector of [8 x int].
281 ///
282 /// \code{.operation}
283 /// FOR j := 0 to 7
284 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
285 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
286 /// 	dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
287 /// ENDFOR
288 /// dst[MAX:256] := 0
289 /// \endcode
290 #define _mm256_dpwusds_epi32(__W, __A, __B)                                    \
291   ((__m256i)__builtin_ia32_vpdpwusds256((__v8si)(__W), (__v8si)(__A),          \
292                                         (__v8si)(__B)))
293 
294 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
295 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
296 ///    unsigned 32-bit results. Sum these 2 results with the corresponding
297 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
298 ///
299 /// \headerfile <immintrin.h>
300 ///
301 /// \code
302 /// __m128i _mm_dpwuud_epi32(__m128i __W, __m128i __A, __m128i __B)
303 /// \endcode
304 ///
305 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
306 ///
307 /// \param __W
308 ///    A 128-bit vector of [4 x unsigned int].
309 /// \param __A
310 ///    A 128-bit vector of [8 x unsigned short].
311 /// \param __B
312 ///    A 128-bit vector of [8 x unsigned short].
313 /// \returns
314 ///    A 128-bit vector of [4 x unsigned int].
315 ///
316 /// \code{.operation}
317 /// FOR j := 0 to 3
318 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
319 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
320 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
321 /// ENDFOR
322 /// dst[MAX:128] := 0
323 /// \endcode
324 #define _mm_dpwuud_epi32(__W, __A, __B)                                        \
325   ((__m128i)__builtin_ia32_vpdpwuud128((__v4si)(__W), (__v4si)(__A),           \
326                                        (__v4si)(__B)))
327 
328 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
329 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
330 ///    unsigned 32-bit results. Sum these 2 results with the corresponding
331 ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
332 ///
333 /// \headerfile <immintrin.h>
334 ///
335 /// \code
336 /// __m256i _mm256_dpwuud_epi32(__m256i __W, __m256i __A, __m256i __B)
337 /// \endcode
338 ///
339 /// This intrinsic corresponds to the \c VPDPWUUD instruction.
340 ///
341 /// \param __W
342 ///    A 256-bit vector of [8 x unsigned int].
343 /// \param __A
344 ///    A 256-bit vector of [16 x unsigned short].
345 /// \param __B
346 ///    A 256-bit vector of [16 x unsigned short].
347 /// \returns
348 ///    A 256-bit vector of [8 x unsigned int].
349 ///
350 /// \code{.operation}
351 /// FOR j := 0 to 7
352 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
353 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
354 /// 	dst.dword[j] := __W.dword[j] + tmp1 + tmp2
355 /// ENDFOR
356 /// dst[MAX:256] := 0
357 /// \endcode
358 #define _mm256_dpwuud_epi32(__W, __A, __B)                                     \
359   ((__m256i)__builtin_ia32_vpdpwuud256((__v8si)(__W), (__v8si)(__A),           \
360                                        (__v8si)(__B)))
361 
362 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
363 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
364 ///    unsigned 32-bit results. Sum these 2 results with the corresponding
365 ///    32-bit integer in \a __W with unsigned saturation, and store the packed
366 ///    32-bit results in \a dst.
367 ///
368 /// \headerfile <immintrin.h>
369 ///
370 /// \code
371 /// __m128i _mm_dpwuuds_epi32(__m128i __W, __m128i __A, __m128i __B)
372 /// \endcode
373 ///
374 /// This intrinsic corresponds to the \c VPDPWUUDS instruction.
375 ///
376 /// \param __W
377 ///    A 128-bit vector of [4 x unsigned int].
378 /// \param __A
379 ///    A 128-bit vector of [8 x unsigned short].
380 /// \param __B
381 ///    A 128-bit vector of [8 x unsigned short].
382 /// \returns
383 ///    A 128-bit vector of [4 x unsigned int].
384 ///
385 /// \code{.operation}
386 /// FOR j := 0 to 3
387 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
388 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
389 /// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
390 /// ENDFOR
391 /// dst[MAX:128] := 0
392 /// \endcode
393 #define _mm_dpwuuds_epi32(__W, __A, __B)                                       \
394   ((__m128i)__builtin_ia32_vpdpwuuds128((__v4si)(__W), (__v4si)(__A),          \
395                                         (__v4si)(__B)))
396 
397 /// Multiply groups of 2 adjacent pairs of unsigned 16-bit integers in \a __A with
398 ///    corresponding unsigned 16-bit integers in \a __B, producing 2 intermediate
399 ///    unsigned 32-bit results. Sum these 2 results with the corresponding
400 ///    32-bit integer in \a __W with unsigned saturation, and store the packed
401 ///    32-bit results in \a dst.
402 ///
403 /// \headerfile <immintrin.h>
404 ///
405 /// \code
406 /// __m256i _mm256_dpwuuds_epi32(__m256i __W, __m256i __A, __m256i __B)
407 /// \endcode
408 ///
409 /// This intrinsic corresponds to the \c VPDPWUUDS instruction.
410 ///
411 /// \param __W
412 ///    A 256-bit vector of [8 x unsigned int].
413 /// \param __A
414 ///    A 256-bit vector of [16 x unsigned short].
415 /// \param __B
416 ///    A 256-bit vector of [16 x unsigned short].
417 /// \returns
418 ///    A 256-bit vector of [8 x unsigned int].
419 ///
420 /// \code{.operation}
421 /// FOR j := 0 to 7
422 /// 	tmp1.dword := ZeroExtend32(__A.word[2*j]) * ZeroExtend32(__B.word[2*j])
423 /// 	tmp2.dword := ZeroExtend32(__A.word[2*j+1]) * ZeroExtend32(__B.word[2*j+1])
424 /// 	dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2)
425 /// ENDFOR
426 /// dst[MAX:256] := 0
427 /// \endcode
428 #define _mm256_dpwuuds_epi32(__W, __A, __B)                                    \
429   ((__m256i)__builtin_ia32_vpdpwuuds256((__v8si)(__W), (__v8si)(__A),          \
430                                         (__v8si)(__B)))
431 
432 #endif // __AVXVNNIINT16INTRIN_H
433