/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <string.h>

#include <string>
#include <typeinfo>

#include "config/aom_dsp_rtcd.h"

#include "test/acm_random.h"
#include "aom_dsp/aom_simd.h"
#undef SIMD_INLINE
#define SIMD_INLINE static  // Don't enforce inlining

#include "aom_dsp/simd/v256_intrinsics_c.h"

// Machine tuned code goes into this file. This file is included from
// simd_cmp_sse2.cc, simd_cmp_ssse3.cc etc which define the macros
// ARCH (=neon, sse2, ssse3, etc), SIMD_NAMESPACE and ARCH_POSTFIX().

#ifdef _MSC_VER
// Disable "value of intrinsic immediate argument 'value' is out of range
// 'lowerbound - upperbound'" warning. Visual Studio emits this warning though
// the parameters are conditionally checked in e.g., v256_shr_n_byte. Adding a
// mask doesn't always appear to be sufficient.
#pragma warning(disable : 4556)
#endif

using libaom_test::ACMRandom;

namespace SIMD_NAMESPACE {

// Wrap templates around intrinsics using immediate values
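// The shift amount / alignment offset of these intrinsics must be a
// compile-time immediate, so it cannot be passed through a function pointer
// at run time. Instantiating one template specialization per legal immediate
// gives every (operation, amount) pair its own address that can be stored in
// the mapping table below.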
template <int shift>
v64 imm_v64_shl_n_byte(v64 a) {
  return v64_shl_n_byte(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_byte(v64 a) {
  return v64_shr_n_byte(a, shift);
}
template <int shift>
v64 imm_v64_shl_n_8(v64 a) {
  return v64_shl_n_8(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_u8(v64 a) {
  return v64_shr_n_u8(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_s8(v64 a) {
  return v64_shr_n_s8(a, shift);
}
template <int shift>
v64 imm_v64_shl_n_16(v64 a) {
  return v64_shl_n_16(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_u16(v64 a) {
  return v64_shr_n_u16(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_s16(v64 a) {
  return v64_shr_n_s16(a, shift);
}
template <int shift>
v64 imm_v64_shl_n_32(v64 a) {
  return v64_shl_n_32(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_u32(v64 a) {
  return v64_shr_n_u32(a, shift);
}
template <int shift>
v64 imm_v64_shr_n_s32(v64 a) {
  return v64_shr_n_s32(a, shift);
}
template <int shift>
v64 imm_v64_align(v64 a, v64 b) {
  return v64_align(a, b, shift);
}

// Wrap templates around corresponding C implementations of the above
template <int shift>
c_v64 c_imm_v64_shl_n_byte(c_v64 a) {
  return c_v64_shl_n_byte(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_byte(c_v64 a) {
  return c_v64_shr_n_byte(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shl_n_8(c_v64 a) {
  return c_v64_shl_n_8(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_u8(c_v64 a) {
  return c_v64_shr_n_u8(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_s8(c_v64 a) {
  return c_v64_shr_n_s8(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shl_n_16(c_v64 a) {
  return c_v64_shl_n_16(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_u16(c_v64 a) {
  return c_v64_shr_n_u16(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_s16(c_v64 a) {
  return c_v64_shr_n_s16(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shl_n_32(c_v64 a) {
  return c_v64_shl_n_32(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_u32(c_v64 a) {
  return c_v64_shr_n_u32(a, shift);
}
template <int shift>
c_v64 c_imm_v64_shr_n_s32(c_v64 a) {
  return c_v64_shr_n_s32(a, shift);
}
template <int shift>
c_v64 c_imm_v64_align(c_v64 a, c_v64 b) {
  return c_v64_align(a, b, shift);
}

template <int shift>
v128 imm_v128_shl_n_byte(v128 a) {
  return v128_shl_n_byte(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_byte(v128 a) {
  return v128_shr_n_byte(a, shift);
}
template <int shift>
v128 imm_v128_shl_n_8(v128 a) {
  return v128_shl_n_8(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_u8(v128 a) {
  return v128_shr_n_u8(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_s8(v128 a) {
  return v128_shr_n_s8(a, shift);
}
template <int shift>
v128 imm_v128_shl_n_16(v128 a) {
  return v128_shl_n_16(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_u16(v128 a) {
  return v128_shr_n_u16(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_s16(v128 a) {
  return v128_shr_n_s16(a, shift);
}
template <int shift>
v128 imm_v128_shl_n_32(v128 a) {
  return v128_shl_n_32(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_u32(v128 a) {
  return v128_shr_n_u32(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_s32(v128 a) {
  return v128_shr_n_s32(a, shift);
}
template <int shift>
v128 imm_v128_shl_n_64(v128 a) {
  return v128_shl_n_64(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_u64(v128 a) {
  return v128_shr_n_u64(a, shift);
}
template <int shift>
v128 imm_v128_shr_n_s64(v128 a) {
  return v128_shr_n_s64(a, shift);
}
template <int shift>
v128 imm_v128_align(v128 a, v128 b) {
  return v128_align(a, b, shift);
}

template <int shift>
c_v128 c_imm_v128_shl_n_byte(c_v128 a) {
  return c_v128_shl_n_byte(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_byte(c_v128 a) {
  return c_v128_shr_n_byte(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shl_n_8(c_v128 a) {
  return c_v128_shl_n_8(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_u8(c_v128 a) {
  return c_v128_shr_n_u8(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_s8(c_v128 a) {
  return c_v128_shr_n_s8(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shl_n_16(c_v128 a) {
  return c_v128_shl_n_16(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_u16(c_v128 a) {
  return c_v128_shr_n_u16(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_s16(c_v128 a) {
  return c_v128_shr_n_s16(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shl_n_32(c_v128 a) {
  return c_v128_shl_n_32(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_u32(c_v128 a) {
  return c_v128_shr_n_u32(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_s32(c_v128 a) {
  return c_v128_shr_n_s32(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shl_n_64(c_v128 a) {
  return c_v128_shl_n_64(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_u64(c_v128 a) {
  return c_v128_shr_n_u64(a, shift);
}
template <int shift>
c_v128 c_imm_v128_shr_n_s64(c_v128 a) {
  return c_v128_shr_n_s64(a, shift);
}
template <int shift>
c_v128 c_imm_v128_align(c_v128 a, c_v128 b) {
  return c_v128_align(a, b, shift);
}

template <int shift>
v256 imm_v256_shl_n_word(v256 a) {
  return v256_shl_n_word(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_word(v256 a) {
  return v256_shr_n_word(a, shift);
}
template <int shift>
v256 imm_v256_shl_n_byte(v256 a) {
  return v256_shl_n_byte(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_byte(v256 a) {
  return v256_shr_n_byte(a, shift);
}
template <int shift>
v256 imm_v256_shl_n_8(v256 a) {
  return v256_shl_n_8(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_u8(v256 a) {
  return v256_shr_n_u8(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_s8(v256 a) {
  return v256_shr_n_s8(a, shift);
}
template <int shift>
v256 imm_v256_shl_n_16(v256 a) {
  return v256_shl_n_16(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_u16(v256 a) {
  return v256_shr_n_u16(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_s16(v256 a) {
  return v256_shr_n_s16(a, shift);
}
template <int shift>
v256 imm_v256_shl_n_32(v256 a) {
  return v256_shl_n_32(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_u32(v256 a) {
  return v256_shr_n_u32(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_s32(v256 a) {
  return v256_shr_n_s32(a, shift);
}
template <int shift>
v256 imm_v256_shl_n_64(v256 a) {
  return v256_shl_n_64(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_u64(v256 a) {
  return v256_shr_n_u64(a, shift);
}
template <int shift>
v256 imm_v256_shr_n_s64(v256 a) {
  return v256_shr_n_s64(a, shift);
}
template <int shift>
v256 imm_v256_align(v256 a, v256 b) {
  return v256_align(a, b, shift);
}

template <int shift>
c_v256 c_imm_v256_shl_n_word(c_v256 a) {
  return c_v256_shl_n_word(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_word(c_v256 a) {
  return c_v256_shr_n_word(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shl_n_byte(c_v256 a) {
  return c_v256_shl_n_byte(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_byte(c_v256 a) {
  return c_v256_shr_n_byte(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shl_n_8(c_v256 a) {
  return c_v256_shl_n_8(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_u8(c_v256 a) {
  return c_v256_shr_n_u8(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_s8(c_v256 a) {
  return c_v256_shr_n_s8(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shl_n_16(c_v256 a) {
  return c_v256_shl_n_16(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_u16(c_v256 a) {
  return c_v256_shr_n_u16(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_s16(c_v256 a) {
  return c_v256_shr_n_s16(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shl_n_32(c_v256 a) {
  return c_v256_shl_n_32(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_u32(c_v256 a) {
  return c_v256_shr_n_u32(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_s32(c_v256 a) {
  return c_v256_shr_n_s32(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shl_n_64(c_v256 a) {
  return c_v256_shl_n_64(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_u64(c_v256 a) {
  return c_v256_shr_n_u64(a, shift);
}
template <int shift>
c_v256 c_imm_v256_shr_n_s64(c_v256 a) {
  return c_v256_shr_n_s64(a, shift);
}
template <int shift>
c_v256 c_imm_v256_align(c_v256 a, c_v256 b) {
  return c_v256_align(a, b, shift);
}

// Wrappers around the SAD and SSD functions
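// The SAD/SSD intrinsics use an init/accumulate/sum calling pattern; folding
// that pattern into a single call here lets them be tested through the same
// one and two argument harness as the other intrinsics.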
uint32_t v64_sad_u8(v64 a, v64 b) {
  return v64_sad_u8_sum(::v64_sad_u8(v64_sad_u8_init(), a, b));
}
uint32_t v64_ssd_u8(v64 a, v64 b) {
  return v64_ssd_u8_sum(::v64_ssd_u8(v64_ssd_u8_init(), a, b));
}

uint32_t c_v64_sad_u8(c_v64 a, c_v64 b) {
  return c_v64_sad_u8_sum(::c_v64_sad_u8(c_v64_sad_u8_init(), a, b));
}
uint32_t c_v64_ssd_u8(c_v64 a, c_v64 b) {
  return c_v64_ssd_u8_sum(::c_v64_ssd_u8(c_v64_ssd_u8_init(), a, b));
}
uint32_t v128_sad_u8(v128 a, v128 b) {
  return v128_sad_u8_sum(::v128_sad_u8(v128_sad_u8_init(), a, b));
}
uint32_t v128_ssd_u8(v128 a, v128 b) {
  return v128_ssd_u8_sum(::v128_ssd_u8(v128_ssd_u8_init(), a, b));
}
uint32_t c_v128_sad_u8(c_v128 a, c_v128 b) {
  return c_v128_sad_u8_sum(::c_v128_sad_u8(c_v128_sad_u8_init(), a, b));
}
uint32_t c_v128_ssd_u8(c_v128 a, c_v128 b) {
  return c_v128_ssd_u8_sum(::c_v128_ssd_u8(c_v128_ssd_u8_init(), a, b));
}
uint32_t v128_sad_u16(v128 a, v128 b) {
  return v128_sad_u16_sum(::v128_sad_u16(v128_sad_u16_init(), a, b));
}
uint64_t v128_ssd_s16(v128 a, v128 b) {
  return v128_ssd_s16_sum(::v128_ssd_s16(v128_ssd_s16_init(), a, b));
}
uint32_t c_v128_sad_u16(c_v128 a, c_v128 b) {
  return c_v128_sad_u16_sum(::c_v128_sad_u16(c_v128_sad_u16_init(), a, b));
}
uint64_t c_v128_ssd_s16(c_v128 a, c_v128 b) {
  return c_v128_ssd_s16_sum(::c_v128_ssd_s16(c_v128_ssd_s16_init(), a, b));
}
uint32_t v256_sad_u8(v256 a, v256 b) {
  return v256_sad_u8_sum(::v256_sad_u8(v256_sad_u8_init(), a, b));
}
uint32_t v256_ssd_u8(v256 a, v256 b) {
  return v256_ssd_u8_sum(::v256_ssd_u8(v256_ssd_u8_init(), a, b));
}
uint32_t c_v256_sad_u8(c_v256 a, c_v256 b) {
  return c_v256_sad_u8_sum(::c_v256_sad_u8(c_v256_sad_u8_init(), a, b));
}
uint32_t c_v256_ssd_u8(c_v256 a, c_v256 b) {
  return c_v256_ssd_u8_sum(::c_v256_ssd_u8(c_v256_ssd_u8_init(), a, b));
}
uint32_t v256_sad_u16(v256 a, v256 b) {
  return v256_sad_u16_sum(::v256_sad_u16(v256_sad_u16_init(), a, b));
}
uint64_t v256_ssd_s16(v256 a, v256 b) {
  return v256_ssd_s16_sum(::v256_ssd_s16(v256_ssd_s16_init(), a, b));
}
uint32_t c_v256_sad_u16(c_v256 a, c_v256 b) {
  return c_v256_sad_u16_sum(::c_v256_sad_u16(c_v256_sad_u16_init(), a, b));
}
uint64_t c_v256_ssd_s16(c_v256 a, c_v256 b) {
  return c_v256_ssd_s16_sum(::c_v256_ssd_s16(c_v256_ssd_s16_init(), a, b));
}

namespace {

typedef void (*fptr)();
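// fptr is a deliberately type-erased function pointer: the table below mixes
// many different signatures, so entries are stored as fptr and cast back to
// their real types inside the CompareSimd*() helpers.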

typedef struct {
  const char *name;
  fptr ref;
  fptr simd;
} mapping;

#define MAP(name) \
  { #name, reinterpret_cast<fptr>(c_##name), reinterpret_cast<fptr>(name) }
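// MAP(v64_add_8) expands to { "v64_add_8", (fptr)c_v64_add_8,
// (fptr)v64_add_8 }, pairing each machine tuned intrinsic with its C
// reference implementation under the name used for lookup.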

const mapping m[] = { MAP(v64_sad_u8),
                      MAP(v64_ssd_u8),
                      MAP(v64_add_8),
                      MAP(v64_add_16),
                      MAP(v64_sadd_s8),
                      MAP(v64_sadd_u8),
                      MAP(v64_sadd_s16),
                      MAP(v64_add_32),
                      MAP(v64_sub_8),
                      MAP(v64_ssub_u8),
                      MAP(v64_ssub_s8),
                      MAP(v64_sub_16),
                      MAP(v64_ssub_s16),
                      MAP(v64_ssub_u16),
                      MAP(v64_sub_32),
                      MAP(v64_ziplo_8),
                      MAP(v64_ziphi_8),
                      MAP(v64_ziplo_16),
                      MAP(v64_ziphi_16),
                      MAP(v64_ziplo_32),
                      MAP(v64_ziphi_32),
                      MAP(v64_pack_s32_u16),
                      MAP(v64_pack_s32_s16),
                      MAP(v64_pack_s16_u8),
                      MAP(v64_pack_s16_s8),
                      MAP(v64_unziphi_8),
                      MAP(v64_unziplo_8),
                      MAP(v64_unziphi_16),
                      MAP(v64_unziplo_16),
                      MAP(v64_or),
                      MAP(v64_xor),
                      MAP(v64_and),
                      MAP(v64_andn),
                      MAP(v64_mullo_s16),
                      MAP(v64_mulhi_s16),
                      MAP(v64_mullo_s32),
                      MAP(v64_madd_s16),
                      MAP(v64_madd_us8),
                      MAP(v64_avg_u8),
                      MAP(v64_rdavg_u8),
                      MAP(v64_rdavg_u16),
                      MAP(v64_avg_u16),
                      MAP(v64_min_u8),
                      MAP(v64_max_u8),
                      MAP(v64_min_s8),
                      MAP(v64_max_s8),
                      MAP(v64_min_s16),
                      MAP(v64_max_s16),
                      MAP(v64_cmpgt_s8),
                      MAP(v64_cmplt_s8),
                      MAP(v64_cmpeq_8),
                      MAP(v64_cmpgt_s16),
                      MAP(v64_cmplt_s16),
                      MAP(v64_cmpeq_16),
                      MAP(v64_shuffle_8),
                      MAP(imm_v64_align<1>),
                      MAP(imm_v64_align<2>),
                      MAP(imm_v64_align<3>),
                      MAP(imm_v64_align<4>),
                      MAP(imm_v64_align<5>),
                      MAP(imm_v64_align<6>),
                      MAP(imm_v64_align<7>),
                      MAP(v64_abs_s8),
                      MAP(v64_abs_s16),
                      MAP(v64_unpacklo_u8_s16),
                      MAP(v64_unpackhi_u8_s16),
                      MAP(v64_unpacklo_s8_s16),
                      MAP(v64_unpackhi_s8_s16),
                      MAP(v64_unpacklo_u16_s32),
                      MAP(v64_unpacklo_s16_s32),
                      MAP(v64_unpackhi_u16_s32),
                      MAP(v64_unpackhi_s16_s32),
                      MAP(imm_v64_shr_n_byte<1>),
                      MAP(imm_v64_shr_n_byte<2>),
                      MAP(imm_v64_shr_n_byte<3>),
                      MAP(imm_v64_shr_n_byte<4>),
                      MAP(imm_v64_shr_n_byte<5>),
                      MAP(imm_v64_shr_n_byte<6>),
                      MAP(imm_v64_shr_n_byte<7>),
                      MAP(imm_v64_shl_n_byte<1>),
                      MAP(imm_v64_shl_n_byte<2>),
                      MAP(imm_v64_shl_n_byte<3>),
                      MAP(imm_v64_shl_n_byte<4>),
                      MAP(imm_v64_shl_n_byte<5>),
                      MAP(imm_v64_shl_n_byte<6>),
                      MAP(imm_v64_shl_n_byte<7>),
                      MAP(imm_v64_shl_n_8<1>),
                      MAP(imm_v64_shl_n_8<2>),
                      MAP(imm_v64_shl_n_8<3>),
                      MAP(imm_v64_shl_n_8<4>),
                      MAP(imm_v64_shl_n_8<5>),
                      MAP(imm_v64_shl_n_8<6>),
                      MAP(imm_v64_shl_n_8<7>),
                      MAP(imm_v64_shr_n_u8<1>),
                      MAP(imm_v64_shr_n_u8<2>),
                      MAP(imm_v64_shr_n_u8<3>),
                      MAP(imm_v64_shr_n_u8<4>),
                      MAP(imm_v64_shr_n_u8<5>),
                      MAP(imm_v64_shr_n_u8<6>),
                      MAP(imm_v64_shr_n_u8<7>),
                      MAP(imm_v64_shr_n_s8<1>),
                      MAP(imm_v64_shr_n_s8<2>),
                      MAP(imm_v64_shr_n_s8<3>),
                      MAP(imm_v64_shr_n_s8<4>),
                      MAP(imm_v64_shr_n_s8<5>),
                      MAP(imm_v64_shr_n_s8<6>),
                      MAP(imm_v64_shr_n_s8<7>),
                      MAP(imm_v64_shl_n_16<1>),
                      MAP(imm_v64_shl_n_16<2>),
                      MAP(imm_v64_shl_n_16<4>),
                      MAP(imm_v64_shl_n_16<6>),
                      MAP(imm_v64_shl_n_16<8>),
                      MAP(imm_v64_shl_n_16<10>),
                      MAP(imm_v64_shl_n_16<12>),
                      MAP(imm_v64_shl_n_16<14>),
                      MAP(imm_v64_shr_n_u16<1>),
                      MAP(imm_v64_shr_n_u16<2>),
                      MAP(imm_v64_shr_n_u16<4>),
                      MAP(imm_v64_shr_n_u16<6>),
                      MAP(imm_v64_shr_n_u16<8>),
                      MAP(imm_v64_shr_n_u16<10>),
                      MAP(imm_v64_shr_n_u16<12>),
                      MAP(imm_v64_shr_n_u16<14>),
                      MAP(imm_v64_shr_n_s16<1>),
                      MAP(imm_v64_shr_n_s16<2>),
                      MAP(imm_v64_shr_n_s16<4>),
                      MAP(imm_v64_shr_n_s16<6>),
                      MAP(imm_v64_shr_n_s16<8>),
                      MAP(imm_v64_shr_n_s16<10>),
                      MAP(imm_v64_shr_n_s16<12>),
                      MAP(imm_v64_shr_n_s16<14>),
                      MAP(imm_v64_shl_n_32<1>),
                      MAP(imm_v64_shl_n_32<4>),
                      MAP(imm_v64_shl_n_32<8>),
                      MAP(imm_v64_shl_n_32<12>),
                      MAP(imm_v64_shl_n_32<16>),
                      MAP(imm_v64_shl_n_32<20>),
                      MAP(imm_v64_shl_n_32<24>),
                      MAP(imm_v64_shl_n_32<28>),
                      MAP(imm_v64_shr_n_u32<1>),
                      MAP(imm_v64_shr_n_u32<4>),
                      MAP(imm_v64_shr_n_u32<8>),
                      MAP(imm_v64_shr_n_u32<12>),
                      MAP(imm_v64_shr_n_u32<16>),
                      MAP(imm_v64_shr_n_u32<20>),
                      MAP(imm_v64_shr_n_u32<24>),
                      MAP(imm_v64_shr_n_u32<28>),
                      MAP(imm_v64_shr_n_s32<1>),
                      MAP(imm_v64_shr_n_s32<4>),
                      MAP(imm_v64_shr_n_s32<8>),
                      MAP(imm_v64_shr_n_s32<12>),
                      MAP(imm_v64_shr_n_s32<16>),
                      MAP(imm_v64_shr_n_s32<20>),
                      MAP(imm_v64_shr_n_s32<24>),
                      MAP(imm_v64_shr_n_s32<28>),
                      MAP(v64_shl_8),
                      MAP(v64_shr_u8),
                      MAP(v64_shr_s8),
                      MAP(v64_shl_16),
                      MAP(v64_shr_u16),
                      MAP(v64_shr_s16),
                      MAP(v64_shl_32),
                      MAP(v64_shr_u32),
                      MAP(v64_shr_s32),
                      MAP(v64_hadd_u8),
                      MAP(v64_hadd_s16),
                      MAP(v64_dotp_s16),
                      MAP(v64_dotp_su8),
                      MAP(v64_u64),
                      MAP(v64_low_u32),
                      MAP(v64_high_u32),
                      MAP(v64_low_s32),
                      MAP(v64_high_s32),
                      MAP(v64_dup_8),
                      MAP(v64_dup_16),
                      MAP(v64_dup_32),
                      MAP(v64_from_32),
                      MAP(v64_zero),
                      MAP(v64_from_16),
                      MAP(v128_sad_u8),
                      MAP(v128_ssd_u8),
                      MAP(v128_sad_u16),
                      MAP(v128_ssd_s16),
                      MAP(v128_add_8),
                      MAP(v128_add_16),
                      MAP(v128_sadd_s8),
                      MAP(v128_sadd_u8),
                      MAP(v128_sadd_s16),
                      MAP(v128_add_32),
                      MAP(v128_add_64),
                      MAP(v128_sub_8),
                      MAP(v128_ssub_u8),
                      MAP(v128_ssub_s8),
                      MAP(v128_sub_16),
                      MAP(v128_ssub_s16),
                      MAP(v128_ssub_u16),
                      MAP(v128_sub_32),
                      MAP(v128_sub_64),
                      MAP(v128_ziplo_8),
                      MAP(v128_ziphi_8),
                      MAP(v128_ziplo_16),
                      MAP(v128_ziphi_16),
                      MAP(v128_ziplo_32),
                      MAP(v128_ziphi_32),
                      MAP(v128_ziplo_64),
                      MAP(v128_ziphi_64),
                      MAP(v128_unziphi_8),
                      MAP(v128_unziplo_8),
                      MAP(v128_unziphi_16),
                      MAP(v128_unziplo_16),
                      MAP(v128_unziphi_32),
                      MAP(v128_unziplo_32),
                      MAP(v128_pack_s32_u16),
                      MAP(v128_pack_s32_s16),
                      MAP(v128_pack_s16_u8),
                      MAP(v128_pack_s16_s8),
                      MAP(v128_or),
                      MAP(v128_xor),
                      MAP(v128_and),
                      MAP(v128_andn),
                      MAP(v128_mullo_s16),
                      MAP(v128_mulhi_s16),
                      MAP(v128_mullo_s32),
                      MAP(v128_madd_s16),
                      MAP(v128_madd_us8),
                      MAP(v128_avg_u8),
                      MAP(v128_rdavg_u8),
                      MAP(v128_rdavg_u16),
                      MAP(v128_avg_u16),
                      MAP(v128_min_u8),
                      MAP(v128_max_u8),
                      MAP(v128_min_s8),
                      MAP(v128_max_s8),
                      MAP(v128_min_s16),
                      MAP(v128_max_s16),
                      MAP(v128_min_s32),
                      MAP(v128_max_s32),
                      MAP(v128_cmpgt_s8),
                      MAP(v128_cmplt_s8),
                      MAP(v128_cmpeq_8),
                      MAP(v128_cmpgt_s16),
                      MAP(v128_cmpeq_16),
                      MAP(v128_cmplt_s16),
                      MAP(v128_cmpgt_s32),
                      MAP(v128_cmpeq_32),
                      MAP(v128_cmplt_s32),
                      MAP(v128_shuffle_8),
                      MAP(imm_v128_align<1>),
                      MAP(imm_v128_align<2>),
                      MAP(imm_v128_align<3>),
                      MAP(imm_v128_align<4>),
                      MAP(imm_v128_align<5>),
                      MAP(imm_v128_align<6>),
                      MAP(imm_v128_align<7>),
                      MAP(imm_v128_align<8>),
                      MAP(imm_v128_align<9>),
                      MAP(imm_v128_align<10>),
                      MAP(imm_v128_align<11>),
                      MAP(imm_v128_align<12>),
                      MAP(imm_v128_align<13>),
                      MAP(imm_v128_align<14>),
                      MAP(imm_v128_align<15>),
                      MAP(v128_abs_s8),
                      MAP(v128_abs_s16),
                      MAP(v128_padd_u8),
                      MAP(v128_padd_s16),
                      MAP(v128_unpacklo_u16_s32),
                      MAP(v128_unpacklo_s16_s32),
                      MAP(v128_unpackhi_u16_s32),
                      MAP(v128_unpackhi_s16_s32),
                      MAP(imm_v128_shr_n_byte<1>),
                      MAP(imm_v128_shr_n_byte<2>),
                      MAP(imm_v128_shr_n_byte<3>),
                      MAP(imm_v128_shr_n_byte<4>),
                      MAP(imm_v128_shr_n_byte<5>),
                      MAP(imm_v128_shr_n_byte<6>),
                      MAP(imm_v128_shr_n_byte<7>),
                      MAP(imm_v128_shr_n_byte<8>),
                      MAP(imm_v128_shr_n_byte<9>),
                      MAP(imm_v128_shr_n_byte<10>),
                      MAP(imm_v128_shr_n_byte<11>),
                      MAP(imm_v128_shr_n_byte<12>),
                      MAP(imm_v128_shr_n_byte<13>),
                      MAP(imm_v128_shr_n_byte<14>),
                      MAP(imm_v128_shr_n_byte<15>),
                      MAP(imm_v128_shl_n_byte<1>),
                      MAP(imm_v128_shl_n_byte<2>),
                      MAP(imm_v128_shl_n_byte<3>),
                      MAP(imm_v128_shl_n_byte<4>),
                      MAP(imm_v128_shl_n_byte<5>),
                      MAP(imm_v128_shl_n_byte<6>),
                      MAP(imm_v128_shl_n_byte<7>),
                      MAP(imm_v128_shl_n_byte<8>),
                      MAP(imm_v128_shl_n_byte<9>),
                      MAP(imm_v128_shl_n_byte<10>),
                      MAP(imm_v128_shl_n_byte<11>),
                      MAP(imm_v128_shl_n_byte<12>),
                      MAP(imm_v128_shl_n_byte<13>),
                      MAP(imm_v128_shl_n_byte<14>),
                      MAP(imm_v128_shl_n_byte<15>),
                      MAP(imm_v128_shl_n_8<1>),
                      MAP(imm_v128_shl_n_8<2>),
                      MAP(imm_v128_shl_n_8<3>),
                      MAP(imm_v128_shl_n_8<4>),
                      MAP(imm_v128_shl_n_8<5>),
                      MAP(imm_v128_shl_n_8<6>),
                      MAP(imm_v128_shl_n_8<7>),
                      MAP(imm_v128_shr_n_u8<1>),
                      MAP(imm_v128_shr_n_u8<2>),
                      MAP(imm_v128_shr_n_u8<3>),
                      MAP(imm_v128_shr_n_u8<4>),
                      MAP(imm_v128_shr_n_u8<5>),
                      MAP(imm_v128_shr_n_u8<6>),
                      MAP(imm_v128_shr_n_u8<7>),
                      MAP(imm_v128_shr_n_s8<1>),
                      MAP(imm_v128_shr_n_s8<2>),
                      MAP(imm_v128_shr_n_s8<3>),
                      MAP(imm_v128_shr_n_s8<4>),
                      MAP(imm_v128_shr_n_s8<5>),
                      MAP(imm_v128_shr_n_s8<6>),
                      MAP(imm_v128_shr_n_s8<7>),
                      MAP(imm_v128_shl_n_16<1>),
                      MAP(imm_v128_shl_n_16<2>),
                      MAP(imm_v128_shl_n_16<4>),
                      MAP(imm_v128_shl_n_16<6>),
                      MAP(imm_v128_shl_n_16<8>),
                      MAP(imm_v128_shl_n_16<10>),
                      MAP(imm_v128_shl_n_16<12>),
                      MAP(imm_v128_shl_n_16<14>),
                      MAP(imm_v128_shr_n_u16<1>),
                      MAP(imm_v128_shr_n_u16<2>),
                      MAP(imm_v128_shr_n_u16<4>),
                      MAP(imm_v128_shr_n_u16<6>),
                      MAP(imm_v128_shr_n_u16<8>),
                      MAP(imm_v128_shr_n_u16<10>),
                      MAP(imm_v128_shr_n_u16<12>),
                      MAP(imm_v128_shr_n_u16<14>),
                      MAP(imm_v128_shr_n_s16<1>),
                      MAP(imm_v128_shr_n_s16<2>),
                      MAP(imm_v128_shr_n_s16<4>),
                      MAP(imm_v128_shr_n_s16<6>),
                      MAP(imm_v128_shr_n_s16<8>),
                      MAP(imm_v128_shr_n_s16<10>),
                      MAP(imm_v128_shr_n_s16<12>),
                      MAP(imm_v128_shr_n_s16<14>),
                      MAP(imm_v128_shl_n_32<1>),
                      MAP(imm_v128_shl_n_32<4>),
                      MAP(imm_v128_shl_n_32<8>),
                      MAP(imm_v128_shl_n_32<12>),
                      MAP(imm_v128_shl_n_32<16>),
                      MAP(imm_v128_shl_n_32<20>),
                      MAP(imm_v128_shl_n_32<24>),
                      MAP(imm_v128_shl_n_32<28>),
                      MAP(imm_v128_shr_n_u32<1>),
                      MAP(imm_v128_shr_n_u32<4>),
                      MAP(imm_v128_shr_n_u32<8>),
                      MAP(imm_v128_shr_n_u32<12>),
                      MAP(imm_v128_shr_n_u32<16>),
                      MAP(imm_v128_shr_n_u32<20>),
                      MAP(imm_v128_shr_n_u32<24>),
                      MAP(imm_v128_shr_n_u32<28>),
                      MAP(imm_v128_shr_n_s32<1>),
                      MAP(imm_v128_shr_n_s32<4>),
                      MAP(imm_v128_shr_n_s32<8>),
                      MAP(imm_v128_shr_n_s32<12>),
                      MAP(imm_v128_shr_n_s32<16>),
                      MAP(imm_v128_shr_n_s32<20>),
                      MAP(imm_v128_shr_n_s32<24>),
                      MAP(imm_v128_shr_n_s32<28>),
                      MAP(imm_v128_shl_n_64<1>),
                      MAP(imm_v128_shl_n_64<4>),
                      MAP(imm_v128_shl_n_64<8>),
                      MAP(imm_v128_shl_n_64<12>),
                      MAP(imm_v128_shl_n_64<16>),
                      MAP(imm_v128_shl_n_64<20>),
                      MAP(imm_v128_shl_n_64<24>),
                      MAP(imm_v128_shl_n_64<28>),
                      MAP(imm_v128_shl_n_64<32>),
                      MAP(imm_v128_shl_n_64<36>),
                      MAP(imm_v128_shl_n_64<40>),
                      MAP(imm_v128_shl_n_64<44>),
                      MAP(imm_v128_shl_n_64<48>),
                      MAP(imm_v128_shl_n_64<52>),
                      MAP(imm_v128_shl_n_64<56>),
                      MAP(imm_v128_shl_n_64<60>),
                      MAP(imm_v128_shr_n_u64<1>),
                      MAP(imm_v128_shr_n_u64<4>),
                      MAP(imm_v128_shr_n_u64<8>),
                      MAP(imm_v128_shr_n_u64<12>),
                      MAP(imm_v128_shr_n_u64<16>),
                      MAP(imm_v128_shr_n_u64<20>),
                      MAP(imm_v128_shr_n_u64<24>),
                      MAP(imm_v128_shr_n_u64<28>),
                      MAP(imm_v128_shr_n_u64<32>),
                      MAP(imm_v128_shr_n_u64<36>),
                      MAP(imm_v128_shr_n_u64<40>),
                      MAP(imm_v128_shr_n_u64<44>),
                      MAP(imm_v128_shr_n_u64<48>),
                      MAP(imm_v128_shr_n_u64<52>),
                      MAP(imm_v128_shr_n_u64<56>),
                      MAP(imm_v128_shr_n_u64<60>),
                      MAP(imm_v128_shr_n_s64<1>),
                      MAP(imm_v128_shr_n_s64<4>),
                      MAP(imm_v128_shr_n_s64<8>),
                      MAP(imm_v128_shr_n_s64<12>),
                      MAP(imm_v128_shr_n_s64<16>),
                      MAP(imm_v128_shr_n_s64<20>),
                      MAP(imm_v128_shr_n_s64<24>),
                      MAP(imm_v128_shr_n_s64<28>),
                      MAP(imm_v128_shr_n_s64<32>),
                      MAP(imm_v128_shr_n_s64<36>),
                      MAP(imm_v128_shr_n_s64<40>),
                      MAP(imm_v128_shr_n_s64<44>),
                      MAP(imm_v128_shr_n_s64<48>),
                      MAP(imm_v128_shr_n_s64<52>),
                      MAP(imm_v128_shr_n_s64<56>),
                      MAP(imm_v128_shr_n_s64<60>),
                      MAP(v128_from_v64),
                      MAP(v128_zip_8),
                      MAP(v128_zip_16),
                      MAP(v128_zip_32),
                      MAP(v128_mul_s16),
                      MAP(v128_unpack_u8_s16),
                      MAP(v128_unpack_s8_s16),
                      MAP(v128_unpack_u16_s32),
                      MAP(v128_unpack_s16_s32),
                      MAP(v128_shl_8),
                      MAP(v128_shr_u8),
                      MAP(v128_shr_s8),
                      MAP(v128_shl_16),
                      MAP(v128_shr_u16),
                      MAP(v128_shr_s16),
                      MAP(v128_shl_32),
                      MAP(v128_shr_u32),
                      MAP(v128_shr_s32),
                      MAP(v128_shl_64),
                      MAP(v128_shr_u64),
                      MAP(v128_shr_s64),
                      MAP(v128_hadd_u8),
                      MAP(v128_dotp_su8),
                      MAP(v128_dotp_s16),
                      MAP(v128_dotp_s32),
                      MAP(v128_low_u32),
                      MAP(v128_low_v64),
                      MAP(v128_high_v64),
                      MAP(v128_from_64),
                      MAP(v128_from_32),
                      MAP(v128_movemask_8),
                      MAP(v128_zero),
                      MAP(v128_dup_8),
                      MAP(v128_dup_16),
                      MAP(v128_dup_32),
                      MAP(v128_dup_64),
                      MAP(v128_unpacklo_u8_s16),
                      MAP(v128_unpackhi_u8_s16),
                      MAP(v128_unpacklo_s8_s16),
                      MAP(v128_unpackhi_s8_s16),
                      MAP(v128_blend_8),
                      MAP(u32_load_unaligned),
                      MAP(u32_store_unaligned),
                      MAP(v64_load_unaligned),
                      MAP(v64_store_unaligned),
                      MAP(v128_load_unaligned),
                      MAP(v128_store_unaligned),
                      MAP(v256_sad_u8),
                      MAP(v256_ssd_u8),
                      MAP(v256_sad_u16),
                      MAP(v256_ssd_s16),
                      MAP(v256_hadd_u8),
                      MAP(v256_low_u64),
                      MAP(v256_dotp_su8),
                      MAP(v256_dotp_s16),
                      MAP(v256_dotp_s32),
                      MAP(v256_add_8),
                      MAP(v256_add_16),
                      MAP(v256_sadd_s8),
                      MAP(v256_sadd_u8),
                      MAP(v256_sadd_s16),
                      MAP(v256_add_32),
                      MAP(v256_add_64),
                      MAP(v256_sub_8),
                      MAP(v256_ssub_u8),
                      MAP(v256_ssub_s8),
                      MAP(v256_sub_16),
                      MAP(v256_ssub_u16),
                      MAP(v256_ssub_s16),
                      MAP(v256_sub_32),
                      MAP(v256_sub_64),
                      MAP(v256_ziplo_8),
                      MAP(v256_ziphi_8),
                      MAP(v256_ziplo_16),
                      MAP(v256_ziphi_16),
                      MAP(v256_ziplo_32),
                      MAP(v256_ziphi_32),
                      MAP(v256_ziplo_64),
                      MAP(v256_ziphi_64),
                      MAP(v256_unziphi_8),
                      MAP(v256_unziplo_8),
                      MAP(v256_unziphi_16),
                      MAP(v256_unziplo_16),
                      MAP(v256_unziphi_32),
                      MAP(v256_unziplo_32),
                      MAP(v256_unziphi_64),
                      MAP(v256_unziplo_64),
                      MAP(v256_pack_s32_u16),
                      MAP(v256_pack_s32_s16),
                      MAP(v256_pack_s16_u8),
                      MAP(v256_pack_s16_s8),
                      MAP(v256_or),
                      MAP(v256_xor),
                      MAP(v256_and),
                      MAP(v256_andn),
                      MAP(v256_mullo_s16),
                      MAP(v256_mulhi_s16),
                      MAP(v256_mullo_s32),
                      MAP(v256_madd_s16),
                      MAP(v256_madd_us8),
                      MAP(v256_avg_u8),
                      MAP(v256_rdavg_u8),
                      MAP(v256_rdavg_u16),
                      MAP(v256_avg_u16),
                      MAP(v256_min_u8),
                      MAP(v256_max_u8),
                      MAP(v256_min_s8),
                      MAP(v256_max_s8),
                      MAP(v256_min_s16),
                      MAP(v256_max_s16),
                      MAP(v256_min_s32),
                      MAP(v256_max_s32),
                      MAP(v256_cmpgt_s8),
                      MAP(v256_cmplt_s8),
                      MAP(v256_cmpeq_8),
                      MAP(v256_cmpgt_s16),
                      MAP(v256_cmplt_s16),
                      MAP(v256_cmpeq_16),
                      MAP(v256_cmpgt_s32),
                      MAP(v256_cmplt_s32),
                      MAP(v256_cmpeq_32),
                      MAP(v256_shuffle_8),
                      MAP(v256_pshuffle_8),
                      MAP(v256_wideshuffle_8),
                      MAP(imm_v256_align<1>),
                      MAP(imm_v256_align<2>),
                      MAP(imm_v256_align<3>),
                      MAP(imm_v256_align<4>),
                      MAP(imm_v256_align<5>),
                      MAP(imm_v256_align<6>),
                      MAP(imm_v256_align<7>),
                      MAP(imm_v256_align<8>),
                      MAP(imm_v256_align<9>),
                      MAP(imm_v256_align<10>),
                      MAP(imm_v256_align<11>),
                      MAP(imm_v256_align<12>),
                      MAP(imm_v256_align<13>),
                      MAP(imm_v256_align<14>),
                      MAP(imm_v256_align<15>),
                      MAP(imm_v256_align<16>),
                      MAP(imm_v256_align<17>),
                      MAP(imm_v256_align<18>),
                      MAP(imm_v256_align<19>),
                      MAP(imm_v256_align<20>),
                      MAP(imm_v256_align<21>),
                      MAP(imm_v256_align<22>),
                      MAP(imm_v256_align<23>),
                      MAP(imm_v256_align<24>),
                      MAP(imm_v256_align<25>),
                      MAP(imm_v256_align<26>),
                      MAP(imm_v256_align<27>),
                      MAP(imm_v256_align<28>),
                      MAP(imm_v256_align<29>),
                      MAP(imm_v256_align<30>),
                      MAP(imm_v256_align<31>),
                      MAP(v256_from_v128),
                      MAP(v256_zip_8),
                      MAP(v256_zip_16),
                      MAP(v256_zip_32),
                      MAP(v256_mul_s16),
                      MAP(v256_unpack_u8_s16),
                      MAP(v256_unpack_s8_s16),
                      MAP(v256_unpack_u16_s32),
                      MAP(v256_unpack_s16_s32),
                      MAP(v256_shl_8),
                      MAP(v256_shr_u8),
                      MAP(v256_shr_s8),
                      MAP(v256_shl_16),
                      MAP(v256_shr_u16),
                      MAP(v256_shr_s16),
                      MAP(v256_shl_32),
                      MAP(v256_shr_u32),
                      MAP(v256_shr_s32),
                      MAP(v256_shl_64),
                      MAP(v256_shr_u64),
                      MAP(v256_shr_s64),
                      MAP(v256_abs_s8),
                      MAP(v256_abs_s16),
                      MAP(v256_padd_u8),
                      MAP(v256_padd_s16),
                      MAP(v256_unpacklo_u16_s32),
                      MAP(v256_unpacklo_s16_s32),
                      MAP(v256_unpackhi_u16_s32),
                      MAP(v256_unpackhi_s16_s32),
                      MAP(imm_v256_shr_n_word<1>),
                      MAP(imm_v256_shr_n_word<2>),
                      MAP(imm_v256_shr_n_word<3>),
                      MAP(imm_v256_shr_n_word<4>),
                      MAP(imm_v256_shr_n_word<5>),
                      MAP(imm_v256_shr_n_word<6>),
                      MAP(imm_v256_shr_n_word<7>),
                      MAP(imm_v256_shr_n_word<8>),
                      MAP(imm_v256_shr_n_word<9>),
                      MAP(imm_v256_shr_n_word<10>),
                      MAP(imm_v256_shr_n_word<11>),
                      MAP(imm_v256_shr_n_word<12>),
                      MAP(imm_v256_shr_n_word<13>),
                      MAP(imm_v256_shr_n_word<14>),
                      MAP(imm_v256_shr_n_word<15>),
                      MAP(imm_v256_shl_n_word<1>),
                      MAP(imm_v256_shl_n_word<2>),
                      MAP(imm_v256_shl_n_word<3>),
                      MAP(imm_v256_shl_n_word<4>),
                      MAP(imm_v256_shl_n_word<5>),
                      MAP(imm_v256_shl_n_word<6>),
                      MAP(imm_v256_shl_n_word<7>),
                      MAP(imm_v256_shl_n_word<8>),
                      MAP(imm_v256_shl_n_word<9>),
                      MAP(imm_v256_shl_n_word<10>),
                      MAP(imm_v256_shl_n_word<11>),
                      MAP(imm_v256_shl_n_word<12>),
                      MAP(imm_v256_shl_n_word<13>),
                      MAP(imm_v256_shl_n_word<14>),
                      MAP(imm_v256_shl_n_word<15>),
                      MAP(imm_v256_shr_n_byte<1>),
                      MAP(imm_v256_shr_n_byte<2>),
                      MAP(imm_v256_shr_n_byte<3>),
                      MAP(imm_v256_shr_n_byte<4>),
                      MAP(imm_v256_shr_n_byte<5>),
                      MAP(imm_v256_shr_n_byte<6>),
                      MAP(imm_v256_shr_n_byte<7>),
                      MAP(imm_v256_shr_n_byte<8>),
                      MAP(imm_v256_shr_n_byte<9>),
                      MAP(imm_v256_shr_n_byte<10>),
                      MAP(imm_v256_shr_n_byte<11>),
                      MAP(imm_v256_shr_n_byte<12>),
                      MAP(imm_v256_shr_n_byte<13>),
                      MAP(imm_v256_shr_n_byte<14>),
                      MAP(imm_v256_shr_n_byte<15>),
                      MAP(imm_v256_shr_n_byte<16>),
                      MAP(imm_v256_shr_n_byte<17>),
                      MAP(imm_v256_shr_n_byte<18>),
                      MAP(imm_v256_shr_n_byte<19>),
                      MAP(imm_v256_shr_n_byte<20>),
                      MAP(imm_v256_shr_n_byte<21>),
                      MAP(imm_v256_shr_n_byte<22>),
                      MAP(imm_v256_shr_n_byte<23>),
                      MAP(imm_v256_shr_n_byte<24>),
                      MAP(imm_v256_shr_n_byte<25>),
                      MAP(imm_v256_shr_n_byte<26>),
                      MAP(imm_v256_shr_n_byte<27>),
                      MAP(imm_v256_shr_n_byte<28>),
                      MAP(imm_v256_shr_n_byte<29>),
                      MAP(imm_v256_shr_n_byte<30>),
                      MAP(imm_v256_shr_n_byte<31>),
                      MAP(imm_v256_shl_n_byte<1>),
                      MAP(imm_v256_shl_n_byte<2>),
                      MAP(imm_v256_shl_n_byte<3>),
                      MAP(imm_v256_shl_n_byte<4>),
                      MAP(imm_v256_shl_n_byte<5>),
                      MAP(imm_v256_shl_n_byte<6>),
                      MAP(imm_v256_shl_n_byte<7>),
                      MAP(imm_v256_shl_n_byte<8>),
                      MAP(imm_v256_shl_n_byte<9>),
                      MAP(imm_v256_shl_n_byte<10>),
                      MAP(imm_v256_shl_n_byte<11>),
                      MAP(imm_v256_shl_n_byte<12>),
                      MAP(imm_v256_shl_n_byte<13>),
                      MAP(imm_v256_shl_n_byte<14>),
                      MAP(imm_v256_shl_n_byte<15>),
                      MAP(imm_v256_shl_n_byte<16>),
                      MAP(imm_v256_shl_n_byte<17>),
                      MAP(imm_v256_shl_n_byte<18>),
                      MAP(imm_v256_shl_n_byte<19>),
                      MAP(imm_v256_shl_n_byte<20>),
                      MAP(imm_v256_shl_n_byte<21>),
                      MAP(imm_v256_shl_n_byte<22>),
                      MAP(imm_v256_shl_n_byte<23>),
                      MAP(imm_v256_shl_n_byte<24>),
                      MAP(imm_v256_shl_n_byte<25>),
                      MAP(imm_v256_shl_n_byte<26>),
                      MAP(imm_v256_shl_n_byte<27>),
                      MAP(imm_v256_shl_n_byte<28>),
                      MAP(imm_v256_shl_n_byte<29>),
                      MAP(imm_v256_shl_n_byte<30>),
                      MAP(imm_v256_shl_n_byte<31>),
                      MAP(imm_v256_shl_n_8<1>),
                      MAP(imm_v256_shl_n_8<2>),
                      MAP(imm_v256_shl_n_8<3>),
                      MAP(imm_v256_shl_n_8<4>),
                      MAP(imm_v256_shl_n_8<5>),
                      MAP(imm_v256_shl_n_8<6>),
                      MAP(imm_v256_shl_n_8<7>),
                      MAP(imm_v256_shr_n_u8<1>),
                      MAP(imm_v256_shr_n_u8<2>),
                      MAP(imm_v256_shr_n_u8<3>),
                      MAP(imm_v256_shr_n_u8<4>),
                      MAP(imm_v256_shr_n_u8<5>),
                      MAP(imm_v256_shr_n_u8<6>),
                      MAP(imm_v256_shr_n_u8<7>),
                      MAP(imm_v256_shr_n_s8<1>),
                      MAP(imm_v256_shr_n_s8<2>),
                      MAP(imm_v256_shr_n_s8<3>),
                      MAP(imm_v256_shr_n_s8<4>),
                      MAP(imm_v256_shr_n_s8<5>),
                      MAP(imm_v256_shr_n_s8<6>),
                      MAP(imm_v256_shr_n_s8<7>),
                      MAP(imm_v256_shl_n_16<1>),
                      MAP(imm_v256_shl_n_16<2>),
                      MAP(imm_v256_shl_n_16<4>),
                      MAP(imm_v256_shl_n_16<6>),
                      MAP(imm_v256_shl_n_16<8>),
                      MAP(imm_v256_shl_n_16<10>),
                      MAP(imm_v256_shl_n_16<12>),
                      MAP(imm_v256_shl_n_16<14>),
                      MAP(imm_v256_shr_n_u16<1>),
                      MAP(imm_v256_shr_n_u16<2>),
                      MAP(imm_v256_shr_n_u16<4>),
                      MAP(imm_v256_shr_n_u16<6>),
                      MAP(imm_v256_shr_n_u16<8>),
                      MAP(imm_v256_shr_n_u16<10>),
                      MAP(imm_v256_shr_n_u16<12>),
                      MAP(imm_v256_shr_n_u16<14>),
                      MAP(imm_v256_shr_n_s16<1>),
                      MAP(imm_v256_shr_n_s16<2>),
                      MAP(imm_v256_shr_n_s16<4>),
                      MAP(imm_v256_shr_n_s16<6>),
                      MAP(imm_v256_shr_n_s16<8>),
                      MAP(imm_v256_shr_n_s16<10>),
                      MAP(imm_v256_shr_n_s16<12>),
                      MAP(imm_v256_shr_n_s16<14>),
                      MAP(imm_v256_shl_n_32<1>),
                      MAP(imm_v256_shl_n_32<4>),
                      MAP(imm_v256_shl_n_32<8>),
                      MAP(imm_v256_shl_n_32<12>),
                      MAP(imm_v256_shl_n_32<16>),
                      MAP(imm_v256_shl_n_32<20>),
                      MAP(imm_v256_shl_n_32<24>),
                      MAP(imm_v256_shl_n_32<28>),
                      MAP(imm_v256_shr_n_u32<1>),
                      MAP(imm_v256_shr_n_u32<4>),
                      MAP(imm_v256_shr_n_u32<8>),
                      MAP(imm_v256_shr_n_u32<12>),
                      MAP(imm_v256_shr_n_u32<16>),
                      MAP(imm_v256_shr_n_u32<20>),
                      MAP(imm_v256_shr_n_u32<24>),
                      MAP(imm_v256_shr_n_u32<28>),
                      MAP(imm_v256_shr_n_s32<1>),
                      MAP(imm_v256_shr_n_s32<4>),
                      MAP(imm_v256_shr_n_s32<8>),
                      MAP(imm_v256_shr_n_s32<12>),
                      MAP(imm_v256_shr_n_s32<16>),
                      MAP(imm_v256_shr_n_s32<20>),
                      MAP(imm_v256_shr_n_s32<24>),
                      MAP(imm_v256_shr_n_s32<28>),
                      MAP(imm_v256_shl_n_64<1>),
                      MAP(imm_v256_shl_n_64<4>),
                      MAP(imm_v256_shl_n_64<8>),
                      MAP(imm_v256_shl_n_64<12>),
                      MAP(imm_v256_shl_n_64<16>),
                      MAP(imm_v256_shl_n_64<20>),
                      MAP(imm_v256_shl_n_64<24>),
                      MAP(imm_v256_shl_n_64<28>),
                      MAP(imm_v256_shl_n_64<32>),
                      MAP(imm_v256_shl_n_64<36>),
                      MAP(imm_v256_shl_n_64<40>),
                      MAP(imm_v256_shl_n_64<44>),
                      MAP(imm_v256_shl_n_64<48>),
                      MAP(imm_v256_shl_n_64<52>),
                      MAP(imm_v256_shl_n_64<56>),
                      MAP(imm_v256_shl_n_64<60>),
                      MAP(imm_v256_shr_n_u64<1>),
                      MAP(imm_v256_shr_n_u64<4>),
                      MAP(imm_v256_shr_n_u64<8>),
                      MAP(imm_v256_shr_n_u64<12>),
                      MAP(imm_v256_shr_n_u64<16>),
                      MAP(imm_v256_shr_n_u64<20>),
                      MAP(imm_v256_shr_n_u64<24>),
                      MAP(imm_v256_shr_n_u64<28>),
                      MAP(imm_v256_shr_n_u64<32>),
                      MAP(imm_v256_shr_n_u64<36>),
                      MAP(imm_v256_shr_n_u64<40>),
                      MAP(imm_v256_shr_n_u64<44>),
                      MAP(imm_v256_shr_n_u64<48>),
                      MAP(imm_v256_shr_n_u64<52>),
                      MAP(imm_v256_shr_n_u64<56>),
                      MAP(imm_v256_shr_n_u64<60>),
                      MAP(imm_v256_shr_n_s64<1>),
                      MAP(imm_v256_shr_n_s64<4>),
                      MAP(imm_v256_shr_n_s64<8>),
                      MAP(imm_v256_shr_n_s64<12>),
                      MAP(imm_v256_shr_n_s64<16>),
                      MAP(imm_v256_shr_n_s64<20>),
                      MAP(imm_v256_shr_n_s64<24>),
                      MAP(imm_v256_shr_n_s64<28>),
                      MAP(imm_v256_shr_n_s64<32>),
                      MAP(imm_v256_shr_n_s64<36>),
                      MAP(imm_v256_shr_n_s64<40>),
                      MAP(imm_v256_shr_n_s64<44>),
                      MAP(imm_v256_shr_n_s64<48>),
                      MAP(imm_v256_shr_n_s64<52>),
                      MAP(imm_v256_shr_n_s64<56>),
                      MAP(imm_v256_shr_n_s64<60>),
                      MAP(v256_movemask_8),
                      MAP(v256_zero),
                      MAP(v256_dup_8),
                      MAP(v256_dup_16),
                      MAP(v256_dup_32),
                      MAP(v256_dup_64),
                      MAP(v256_low_u32),
                      MAP(v256_low_v64),
                      MAP(v256_from_64),
                      MAP(v256_from_v64),
                      MAP(v256_ziplo_128),
                      MAP(v256_ziphi_128),
                      MAP(v256_unpacklo_u8_s16),
                      MAP(v256_unpackhi_u8_s16),
                      MAP(v256_unpacklo_s8_s16),
                      MAP(v256_unpackhi_s8_s16),
                      MAP(v256_blend_8),
                      { NULL, NULL, NULL } };
#undef MAP

// Map reference functions to machine tuned functions. Since the
// functions depend on machine tuned types, the non-machine tuned
// instantiations of the test can't refer to these functions directly,
// so we refer to them by name and do the mapping here.
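// The lookup scans the table linearly and relies on the { NULL, NULL, NULL }
// sentinel: an unknown name stops at the sentinel and returns NULL pointers,
// which the TestSimd*() callers report as an internal error.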
void Map(const char *name, fptr *ref, fptr *simd) {
  unsigned int i;
  for (i = 0; m[i].name && strcmp(name, m[i].name); i++) {
  }

  *ref = m[i].ref;
  *simd = m[i].simd;
}

// Used for printing errors in TestSimd1Arg, TestSimd2Args and TestSimd3Args
std::string Print(const uint8_t *a, int size) {
  std::string text = "0x";
  for (int i = 0; i < size; i++) {
    const uint8_t c = a[!CONFIG_BIG_ENDIAN ? size - 1 - i : i];
    // Same as snprintf(..., ..., "%02x", c)
    text += (c >> 4) + '0' + ((c >> 4) > 9) * ('a' - '0' - 10);
    text += (c & 15) + '0' + ((c & 15) > 9) * ('a' - '0' - 10);
  }

  return text;
}

// Used in TestSimd1Arg, TestSimd2Args and TestSimd3Args to restrict argument
// ranges
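// For example, SetMask(s, 32, 15, 16) masks every 16-bit lane of a 32 byte
// buffer down to the range 0..15, which keeps randomly generated shift
// amounts within the width of the lanes being shifted.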
void SetMask(uint8_t *s, int size, uint32_t mask, uint32_t maskwidth) {
  switch (maskwidth) {
    case 0: {
      break;
    }
    case 8: {
      for (int i = 0; i < size; i++) s[i] &= mask;
      break;
    }
    case 16: {
      uint16_t *t = reinterpret_cast<uint16_t *>(s);
      assert(!(reinterpret_cast<uintptr_t>(s) & 1));
      for (int i = 0; i < size / 2; i++) t[i] &= mask;
      break;
    }
    case 32: {
      uint32_t *t = reinterpret_cast<uint32_t *>(s);
      assert(!(reinterpret_cast<uintptr_t>(s) & 3));
      for (int i = 0; i < size / 4; i++) t[i] &= mask;
      break;
    }
    case 64: {
      uint64_t *t = reinterpret_cast<uint64_t *>(s);
      assert(!(reinterpret_cast<uintptr_t>(s) & 7));
      for (int i = 0; i < size / 8; i++) t[i] &= mask;
      break;
    }
    default: {
      FAIL() << "Unsupported mask width";
      break;
    }
  }
}

// We need some extra load/store functions
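// These scalar wrappers give plain integer arguments and results the same
// load/store interface as the vector types, so they can be passed through
// the same fptr casting mechanism in the CompareSimd*() helpers.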
void u64_store_aligned(void *p, uint64_t a) {
  v64_store_aligned(p, v64_from_64(a));
}
void s32_store_aligned(void *p, int32_t a) {
  u32_store_aligned(p, static_cast<uint32_t>(a));
}
void s64_store_aligned(void *p, int64_t a) {
  v64_store_aligned(p, v64_from_64(static_cast<uint64_t>(a)));
}

void c_u64_store_aligned(void *p, uint64_t a) {
  c_v64_store_aligned(p, c_v64_from_64(a));
}

void c_s32_store_aligned(void *p, int32_t a) {
  c_u32_store_aligned(p, static_cast<uint32_t>(a));
}

void c_s64_store_aligned(void *p, int64_t a) {
  c_v64_store_aligned(p, c_v64_from_64(static_cast<uint64_t>(a)));
}

uint64_t u64_load_aligned(const void *p) {
  return v64_u64(v64_load_aligned(p));
}
uint16_t u16_load_aligned(const void *p) {
  return *(reinterpret_cast<const uint16_t *>(p));
}
uint8_t u8_load_aligned(const void *p) {
  return *(reinterpret_cast<const uint8_t *>(p));
}

uint64_t c_u64_load_aligned(const void *p) {
  return c_v64_u64(c_v64_load_aligned(p));
}
uint16_t c_u16_load_aligned(const void *p) {
  return *(reinterpret_cast<const uint16_t *>(p));
}
uint8_t c_u8_load_aligned(const void *p) {
  return *(reinterpret_cast<const uint8_t *>(p));
}

// CompareSimd1Arg, CompareSimd2Args and CompareSimd3Args compare
// intrinsics taking 1, 2 or 3 arguments respectively with their
// corresponding C reference. Ideally, the loads and stores should
// have gone into the template parameter list, but v64 and v128 could
// be typedef'ed to the same type (which is the case on x86) and then
// we can't instantiate both v64 and v128, so the function return and
// argument types, including the always differing types in the C
// equivalent are used instead. The function arguments must be void
// pointers and then go through a cast to avoid matching errors in the
// branches eliminated by the typeid tests in the calling function.
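// For example, a v64 = f(v64) intrinsic is checked as
// CompareSimd1Arg<v64, v64, c_v64, c_v64>(...) with v64_store_aligned /
// v64_load_aligned on the SIMD side and their c_v64 equivalents on the
// reference side (see the V64_V64 branch in TestSimd1Arg below).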
template <typename Ret, typename Arg, typename CRet, typename CArg>
int CompareSimd1Arg(fptr store, fptr load, fptr simd, void *d, fptr c_store,
                    fptr c_load, fptr c_simd, void *ref_d, const void *a) {
  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
  Arg (*const my_load)(const void *) = (Arg(*const)(const void *))load;
  Ret (*const my_simd)(Arg) = (Ret(*const)(Arg))simd;
  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
  CArg (*const my_c_load)(const void *) = (CArg(*const)(const void *))c_load;
  CRet (*const my_c_simd)(CArg) = (CRet(*const)(CArg))c_simd;

  // Call reference and intrinsic
  my_c_store(ref_d, my_c_simd(my_c_load(a)));
  my_store(d, my_simd(my_load(a)));

  // Compare results
  return memcmp(ref_d, d, sizeof(CRet));
}

template <typename Ret, typename Arg1, typename Arg2, typename CRet,
          typename CArg1, typename CArg2>
int CompareSimd2Args(fptr store, fptr load1, fptr load2, fptr simd, void *d,
                     fptr c_store, fptr c_load1, fptr c_load2, fptr c_simd,
                     void *ref_d, const void *a, const void *b) {
  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
  Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
  Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
  Ret (*const my_simd)(Arg1, Arg2) = (Ret(*const)(Arg1, Arg2))simd;
  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
  CArg1 (*const my_c_load1)(const void *) =
      (CArg1(*const)(const void *))c_load1;
  CArg2 (*const my_c_load2)(const void *) =
      (CArg2(*const)(const void *))c_load2;
  CRet (*const my_c_simd)(CArg1, CArg2) = (CRet(*const)(CArg1, CArg2))c_simd;

  // Call reference and intrinsic
  my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b)));
  my_store(d, my_simd(my_load1(a), my_load2(b)));

  // Compare results
  return memcmp(ref_d, d, sizeof(CRet));
}

template <typename Ret, typename Arg1, typename Arg2, typename Arg3,
          typename CRet, typename CArg1, typename CArg2, typename CArg3>
int CompareSimd3Args(fptr store, fptr load1, fptr load2, fptr load3, fptr simd,
                     void *d, fptr c_store, fptr c_load1, fptr c_load2,
                     fptr c_load3, fptr c_simd, void *ref_d, const void *a,
                     const void *b, const void *c) {
  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
  Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
  Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
  Arg3 (*const my_load3)(const void *) = (Arg3(*const)(const void *))load3;
  Ret (*const my_simd)(Arg1, Arg2, Arg3) = (Ret(*const)(Arg1, Arg2, Arg3))simd;
  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
  CArg1 (*const my_c_load1)(const void *) =
      (CArg1(*const)(const void *))c_load1;
  CArg2 (*const my_c_load2)(const void *) =
      (CArg2(*const)(const void *))c_load2;
  CArg3 (*const my_c_load3)(const void *) =
      (CArg3(*const)(const void *))c_load3;
  CRet (*const my_c_simd)(CArg1, CArg2, CArg3) =
      (CRet(*const)(CArg1, CArg2, CArg3))c_simd;

  // Call reference and intrinsic
  my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b), my_c_load3(c)));
  my_store(d, my_simd(my_load1(a), my_load2(b), my_load3(c)));

  // Compare results
  return memcmp(ref_d, d, sizeof(CRet));
}

}  // namespace

template <typename CRet, typename CArg>
void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
                  const char *name) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  fptr ref_simd;
  fptr simd;
  int error = 0;
  DECLARE_ALIGNED(32, uint8_t, s[32]);
  DECLARE_ALIGNED(32, uint8_t, d[32]);
  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
  assert(sizeof(CArg) <= 32 && sizeof(CRet) <= 32);
  memset(ref_d, 0, sizeof(ref_d));
  memset(d, 0, sizeof(d));

  Map(name, &ref_simd, &simd);
  if (simd == NULL || ref_simd == NULL) {
    FAIL() << "Internal error: Unknown intrinsic function " << name;
  }
  for (unsigned int count = 0;
       count < iterations && !error && !testing::Test::HasFailure(); count++) {
    for (unsigned int c = 0; c < sizeof(CArg); c++) s[c] = rnd.Rand8();

    if (maskwidth) {
      SetMask(s, sizeof(CArg), mask, maskwidth);
    }

    if (typeid(CRet) == typeid(c_v64) && typeid(CArg) == typeid(c_v64)) {
      // V64_V64
      error = CompareSimd1Arg<v64, v64, CRet, CArg>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(uint8_t)) {
      // V64_U8
      error = CompareSimd1Arg<v64, uint8_t, CRet, CArg>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(uint16_t)) {
      // V64_U16
      error = CompareSimd1Arg<v64, uint16_t, CRet, CArg>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(uint32_t)) {
      // V64_U32
      error = CompareSimd1Arg<v64, uint32_t, CRet, CArg>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // U64_V64
      error = CompareSimd1Arg<uint64_t, v64, CRet, CArg>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(int64_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // S64_V64
      error = CompareSimd1Arg<int64_t, v64, CRet, CArg>(
          reinterpret_cast<fptr>(s64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // U32_V64
      error = CompareSimd1Arg<uint32_t, v64, CRet, CArg>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(int32_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // S32_V64
      error = CompareSimd1Arg<int32_t, v64, CRet, CArg>(
          reinterpret_cast<fptr>(s32_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s32_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg) == typeid(c_v128)) {
      // U32_V128
      error = CompareSimd1Arg<uint32_t, v128, CRet, CArg>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg) == typeid(c_v128)) {
      // U64_V128
      error = CompareSimd1Arg<uint64_t, v128, CRet, CArg>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg) == typeid(c_v256)) {
      // U64_V256
      error = CompareSimd1Arg<uint64_t, v256, CRet, CArg>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(c_v128)) {
      // V64_V128
      error = CompareSimd1Arg<v64, v128, CRet, CArg>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(c_v128)) {
      // V128_V128
      error = CompareSimd1Arg<v128, v128, CRet, CArg>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(c_v64)) {
      // V128_V64
      error = CompareSimd1Arg<v128, v64, CRet, CArg>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint8_t)) {
      // V128_U8
      error = CompareSimd1Arg<v128, uint8_t, CRet, CArg>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint16_t)) {
      // V128_U16
      error = CompareSimd1Arg<v128, uint16_t, CRet, CArg>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint32_t)) {
      // V128_U32
      error = CompareSimd1Arg<v128, uint32_t, CRet, CArg>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint64_t)) {
      // V128_U64
      error = CompareSimd1Arg<v128, uint64_t, CRet, CArg>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(c_v256)) {
      // V256_V256
      error = CompareSimd1Arg<v256, v256, CRet, CArg>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(c_v128)) {
      // V256_V128
      error = CompareSimd1Arg<v256, v128, CRet, CArg>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint8_t)) {
      // V256_U8
      error = CompareSimd1Arg<v256, uint8_t, CRet, CArg>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint16_t)) {
      // V256_U16
      error = CompareSimd1Arg<v256, uint16_t, CRet, CArg>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint32_t)) {
      // V256_U32
      error = CompareSimd1Arg<v256, uint32_t, CRet, CArg>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint64_t)) {
      // V256_U64
      error = CompareSimd1Arg<v256, uint64_t, CRet, CArg>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg) == typeid(c_v256)) {
      // U32_V256
      error = CompareSimd1Arg<uint32_t, v256, CRet, CArg>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
1719 typeid(CArg) == typeid(c_v256)) {
1720 // V64_V256
1721 error = CompareSimd1Arg<v64, v256, CRet, CArg>(
1722 reinterpret_cast<fptr>(v64_store_aligned),
1723 reinterpret_cast<fptr>(v256_load_aligned), simd, d,
1724 reinterpret_cast<fptr>(c_v64_store_aligned),
1725 reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
1726 } else {
1727 FAIL() << "Internal error: Unknown intrinsic function "
1728 << typeid(CRet).name() << " " << name << "(" << typeid(CArg).name()
1729 << ")";
1730 }
1731 }
1732
1733 EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
1734 << Print(s, sizeof(CArg)) << ") -> "
1735 << Print(d, sizeof(CRet)) << " (simd), "
1736 << Print(ref_d, sizeof(CRet)) << " (ref)";
1737 }
1738
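// Illustrative only: a one-argument intrinsic with C signature
// c_v64 f(c_v64), e.g. v64_abs_s16, would be exercised as
//   TestSimd1Arg<c_v64, c_v64>(kIterations, 0U, 0U, "v64_abs_s16");
// where kIterations is a hypothetical constant chosen by the caller.

// TestSimd2Args: drive a two-argument intrinsic and its C reference over
// random inputs and compare the stored results.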
template <typename CRet, typename CArg1, typename CArg2>
void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
                   const char *name) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  fptr ref_simd;
  fptr simd;
  int error = 0;
  DECLARE_ALIGNED(32, uint8_t, s1[32]);
  DECLARE_ALIGNED(32, uint8_t, s2[32]);
  DECLARE_ALIGNED(32, uint8_t, d[32]);
  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
  assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CRet) <= 32);
  memset(ref_d, 0, sizeof(ref_d));
  memset(d, 0, sizeof(d));

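  // Map() looks up the intrinsic by name and returns both the machine-tuned
  // implementation and its C reference.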
  Map(name, &ref_simd, &simd);
  if (simd == NULL || ref_simd == NULL) {
    FAIL() << "Internal error: Unknown intrinsic function " << name;
  }

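  // Try random inputs until the iteration budget is spent, a mismatch is
  // detected or a GTest failure has already been recorded.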
  for (unsigned int count = 0;
       count < iterations && !error && !testing::Test::HasFailure(); count++) {
    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();

    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();

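    // If requested, constrain the second operand (typically a shift count or
    // element index) to a valid range.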
    if (maskwidth) SetMask(s2, sizeof(CArg2), mask, maskwidth);

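    // Dispatch on the C reference types. Each branch pairs the intrinsic with
    // matching load/store wrappers; the tags follow the convention
    // RETURN_ARG1ARG2, so V64_V64V64 denotes v64 (*)(v64, v64).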
    if (typeid(CRet) == typeid(c_v64) && typeid(CArg1) == typeid(c_v64) &&
        typeid(CArg2) == typeid(c_v64)) {
      // V64_V64V64
      error = CompareSimd2Args<v64, v64, v64, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg1) == typeid(uint32_t) &&
               typeid(CArg2) == typeid(uint32_t)) {
      // V64_U32U32
      error = CompareSimd2Args<v64, uint32_t, uint32_t, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(u32_load_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg1) == typeid(c_v64) &&
               typeid(CArg2) == typeid(c_v64)) {
      // U32_V64V64
      error = CompareSimd2Args<uint32_t, v64, v64, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(int64_t) &&
               typeid(CArg1) == typeid(c_v64) &&
               typeid(CArg2) == typeid(c_v64)) {
      // S64_V64V64
      error = CompareSimd2Args<int64_t, v64, v64, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(s64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg1) == typeid(c_v64) &&
               typeid(CArg2) == typeid(uint32_t)) {
      // V64_V64U32
      error = CompareSimd2Args<v64, v64, uint32_t, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(c_v128)) {
      // V128_V128V128
      error = CompareSimd2Args<v128, v128, v128, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(c_v128)) {
      // U32_V128V128
      error = CompareSimd2Args<uint32_t, v128, v128, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(c_v128)) {
      // U64_V128V128
      error = CompareSimd2Args<uint64_t, v128, v128, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(int64_t) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(c_v128)) {
      // S64_V128V128
      error = CompareSimd2Args<int64_t, v128, v128, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(s64_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s64_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg1) == typeid(uint64_t) &&
               typeid(CArg2) == typeid(uint64_t)) {
      // V128_U64U64
      error = CompareSimd2Args<v128, uint64_t, uint64_t, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u64_load_aligned),
          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u64_load_aligned),
          reinterpret_cast<fptr>(c_u64_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg1) == typeid(c_v64) &&
               typeid(CArg2) == typeid(c_v64)) {
      // V128_V64V64
      error = CompareSimd2Args<v128, v64, v64, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(uint32_t)) {
      // V128_V128U32
      error = CompareSimd2Args<v128, v128, uint32_t, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(c_v256)) {
      // V256_V256V256
      error = CompareSimd2Args<v256, v256, v256, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(c_v256)) {
      // U64_V256V256
      error = CompareSimd2Args<uint64_t, v256, v256, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(int64_t) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(c_v256)) {
      // S64_V256V256
      error = CompareSimd2Args<int64_t, v256, v256, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(s64_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s64_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(c_v256)) {
      // U32_V256V256
      error = CompareSimd2Args<uint32_t, v256, v256, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(c_v128)) {
      // V256_V128V128
      error = CompareSimd2Args<v256, v128, v128, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(uint32_t)) {
      // V256_V256U32
      error = CompareSimd2Args<v256, v256, uint32_t, CRet, CArg1, CArg2>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else {
      FAIL() << "Internal error: Unknown intrinsic function "
             << typeid(CRet).name() << " " << name << "("
             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ")";
    }
  }

  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
                      << Print(s1, sizeof(CArg1)) << ", "
                      << Print(s2, sizeof(CArg2)) << ") -> "
                      << Print(d, sizeof(CRet)) << " (simd), "
                      << Print(ref_d, sizeof(CRet)) << " (ref)";
}

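// Illustrative only: a binary intrinsic such as v64_add_8, with C signature
// c_v64 f(c_v64, c_v64), would be exercised as
//   TestSimd2Args<c_v64, c_v64, c_v64>(kIterations, 0U, 0U, "v64_add_8");

// TestSimd3Args: same scheme as above, extended to three-argument
// intrinsics; only the all-vector signatures below are needed.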
template <typename CRet, typename CArg1, typename CArg2, typename CArg3>
void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
                   const char *name) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  fptr ref_simd;
  fptr simd;
  int error = 0;
  DECLARE_ALIGNED(32, uint8_t, s1[32]);
  DECLARE_ALIGNED(32, uint8_t, s2[32]);
  DECLARE_ALIGNED(32, uint8_t, s3[32]);
  DECLARE_ALIGNED(32, uint8_t, d[32]);
  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
  assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CArg3) <= 32 &&
         sizeof(CRet) <= 32);
  memset(ref_d, 0, sizeof(ref_d));
  memset(d, 0, sizeof(d));

  Map(name, &ref_simd, &simd);
  if (simd == NULL || ref_simd == NULL) {
    FAIL() << "Internal error: Unknown intrinsic function " << name;
  }

  for (unsigned int count = 0;
       count < iterations && !error && !testing::Test::HasFailure(); count++) {
    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();

    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();

    for (unsigned int c = 0; c < sizeof(CArg3); c++) s3[c] = rnd.Rand8();

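    // As in TestSimd2Args, constrain the final operand when a mask is given.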
    if (maskwidth) SetMask(s3, sizeof(CArg3), mask, maskwidth);

    if (typeid(CRet) == typeid(c_v128) && typeid(CArg1) == typeid(c_v128) &&
        typeid(CArg2) == typeid(c_v128) && typeid(CArg3) == typeid(c_v128)) {
      // V128_V128V128V128
      error =
          CompareSimd3Args<v128, v128, v128, v128, CRet, CArg1, CArg2, CArg3>(
              reinterpret_cast<fptr>(v128_store_aligned),
              reinterpret_cast<fptr>(v128_load_aligned),
              reinterpret_cast<fptr>(v128_load_aligned),
              reinterpret_cast<fptr>(v128_load_aligned), simd, d,
              reinterpret_cast<fptr>(c_v128_store_aligned),
              reinterpret_cast<fptr>(c_v128_load_aligned),
              reinterpret_cast<fptr>(c_v128_load_aligned),
              reinterpret_cast<fptr>(c_v128_load_aligned),
              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(c_v256) &&
               typeid(CArg3) == typeid(c_v256)) {
      // V256_V256V256V256
      error =
          CompareSimd3Args<v256, v256, v256, v256, CRet, CArg1, CArg2, CArg3>(
              reinterpret_cast<fptr>(v256_store_aligned),
              reinterpret_cast<fptr>(v256_load_aligned),
              reinterpret_cast<fptr>(v256_load_aligned),
              reinterpret_cast<fptr>(v256_load_aligned), simd, d,
              reinterpret_cast<fptr>(c_v256_store_aligned),
              reinterpret_cast<fptr>(c_v256_load_aligned),
              reinterpret_cast<fptr>(c_v256_load_aligned),
              reinterpret_cast<fptr>(c_v256_load_aligned),
              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
    } else {
      FAIL() << "Internal error: Unknown intrinsic function "
             << typeid(CRet).name() << " " << name << "("
             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ", "
             << typeid(CArg3).name() << ")";
    }
  }

  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
                      << Print(s1, sizeof(CArg1)) << ", "
                      << Print(s2, sizeof(CArg2)) << ", "
                      << Print(s3, sizeof(CArg3)) << ") -> "
                      << Print(d, sizeof(CRet)) << " (simd), "
                      << Print(ref_d, sizeof(CRet)) << " (ref)";
}
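
// Illustrative only: for a hypothetical ternary intrinsic "v128_sel" with C
// signature c_v128 f(c_v128, c_v128, c_v128), a driver would call
//   TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(kIterations, 0U, 0U,
//                                                 "v128_sel");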

// Instantiations to make the functions callable from other files
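// Every signature category exercised by the per-architecture test drivers
// must appear here; a missing instantiation surfaces as a link error.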
template void TestSimd1Arg<c_v64, uint8_t>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<c_v64, uint16_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v64, uint32_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                         const char *);
template void TestSimd1Arg<uint32_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<int32_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<uint64_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<int64_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd2Args<c_v64, uint32_t, uint32_t>(uint32_t, uint32_t,
                                                       uint32_t, const char *);
template void TestSimd2Args<c_v64, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                                 const char *);
template void TestSimd2Args<c_v64, c_v64, uint32_t>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<int64_t, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                                   const char *);
template void TestSimd2Args<uint32_t, c_v64, c_v64>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd1Arg<c_v128, c_v128>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<c_v128, uint8_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v128, uint16_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v128, uint32_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v128, uint64_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v128, c_v64>(uint32_t, uint32_t, uint32_t,
                                          const char *);
template void TestSimd1Arg<uint32_t, c_v128>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<uint64_t, c_v128>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v64, c_v128>(uint32_t, uint32_t, uint32_t,
                                          const char *);
template void TestSimd2Args<c_v128, c_v128, c_v128>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<c_v128, c_v128, uint32_t>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<c_v128, uint64_t, uint64_t>(uint32_t, uint32_t,
                                                        uint32_t, const char *);
template void TestSimd2Args<c_v128, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                                  const char *);
template void TestSimd2Args<uint64_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<int64_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                     uint32_t, const char *);
template void TestSimd2Args<uint32_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(uint32_t, uint32_t,
                                                            uint32_t,
                                                            const char *);
template void TestSimd1Arg<c_v256, c_v128>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<c_v256, c_v256>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<uint64_t, c_v256>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v256, uint8_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v256, uint16_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v256, uint32_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v256, uint64_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<uint32_t, c_v256>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v64, c_v256>(uint32_t, uint32_t, uint32_t,
                                          const char *);
template void TestSimd2Args<c_v256, c_v128, c_v128>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<c_v256, c_v256, c_v256>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<c_v256, c_v256, uint32_t>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<uint64_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<int64_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                     uint32_t, const char *);
template void TestSimd2Args<uint32_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(uint32_t, uint32_t,
                                                            uint32_t,
                                                            const char *);

}  // namespace SIMD_NAMESPACE