• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2016, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #include <assert.h>
13 #include <string>
14 
15 #include "config/aom_dsp_rtcd.h"
16 
17 #include "test/acm_random.h"
18 // Inlining not forced for the compiler due to some tests calling
19 // SIMD_INLINE functions via function pointers
20 #undef SIMD_INLINE
21 #define SIMD_INLINE static inline
22 #include "aom_dsp/aom_simd.h"
23 #include "aom_dsp/simd/v256_intrinsics_c.h"
24 
25 // Machine tuned code goes into this file. This file is included from
26 // simd_cmp_sse2.cc, simd_cmp_ssse3.cc etc which define the macros
27 // ARCH (=neon, sse2, ssse3, etc), SIMD_NAMESPACE and ARCH_POSTFIX().
28 
29 #ifdef _MSC_VER
30 // Disable "value of intrinsic immediate argument 'value' is out of range
31 // 'lowerbound - upperbound'" warning. Visual Studio emits this warning though
32 // the parameters are conditionally checked in e.g., v256_shr_n_byte. Adding a
33 // mask doesn't always appear to be sufficient.
34 #pragma warning(disable : 4556)
35 #endif
36 
37 using libaom_test::ACMRandom;
38 
39 namespace SIMD_NAMESPACE {
40 
41 // Wrap templates around intrinsics using immediate values
42 template <int shift>
imm_v64_shl_n_byte(v64 a)43 v64 imm_v64_shl_n_byte(v64 a) {
44   return v64_shl_n_byte(a, shift);
45 }
46 template <int shift>
imm_v64_shr_n_byte(v64 a)47 v64 imm_v64_shr_n_byte(v64 a) {
48   return v64_shr_n_byte(a, shift);
49 }
50 template <int shift>
imm_v64_shl_n_8(v64 a)51 v64 imm_v64_shl_n_8(v64 a) {
52   return v64_shl_n_8(a, shift);
53 }
54 template <int shift>
imm_v64_shr_n_u8(v64 a)55 v64 imm_v64_shr_n_u8(v64 a) {
56   return v64_shr_n_u8(a, shift);
57 }
58 template <int shift>
imm_v64_shr_n_s8(v64 a)59 v64 imm_v64_shr_n_s8(v64 a) {
60   return v64_shr_n_s8(a, shift);
61 }
62 template <int shift>
imm_v64_shl_n_16(v64 a)63 v64 imm_v64_shl_n_16(v64 a) {
64   return v64_shl_n_16(a, shift);
65 }
66 template <int shift>
imm_v64_shr_n_u16(v64 a)67 v64 imm_v64_shr_n_u16(v64 a) {
68   return v64_shr_n_u16(a, shift);
69 }
70 template <int shift>
imm_v64_shr_n_s16(v64 a)71 v64 imm_v64_shr_n_s16(v64 a) {
72   return v64_shr_n_s16(a, shift);
73 }
74 template <int shift>
imm_v64_shl_n_32(v64 a)75 v64 imm_v64_shl_n_32(v64 a) {
76   return v64_shl_n_32(a, shift);
77 }
78 template <int shift>
imm_v64_shr_n_u32(v64 a)79 v64 imm_v64_shr_n_u32(v64 a) {
80   return v64_shr_n_u32(a, shift);
81 }
82 template <int shift>
imm_v64_shr_n_s32(v64 a)83 v64 imm_v64_shr_n_s32(v64 a) {
84   return v64_shr_n_s32(a, shift);
85 }
86 template <int shift>
imm_v64_align(v64 a,v64 b)87 v64 imm_v64_align(v64 a, v64 b) {
88   return v64_align(a, b, shift);
89 }
90 
91 // Wrap templates around corresponding C implementations of the above
92 template <int shift>
c_imm_v64_shl_n_byte(c_v64 a)93 c_v64 c_imm_v64_shl_n_byte(c_v64 a) {
94   return c_v64_shl_n_byte(a, shift);
95 }
96 template <int shift>
c_imm_v64_shr_n_byte(c_v64 a)97 c_v64 c_imm_v64_shr_n_byte(c_v64 a) {
98   return c_v64_shr_n_byte(a, shift);
99 }
100 template <int shift>
c_imm_v64_shl_n_8(c_v64 a)101 c_v64 c_imm_v64_shl_n_8(c_v64 a) {
102   return c_v64_shl_n_8(a, shift);
103 }
104 template <int shift>
c_imm_v64_shr_n_u8(c_v64 a)105 c_v64 c_imm_v64_shr_n_u8(c_v64 a) {
106   return c_v64_shr_n_u8(a, shift);
107 }
108 template <int shift>
c_imm_v64_shr_n_s8(c_v64 a)109 c_v64 c_imm_v64_shr_n_s8(c_v64 a) {
110   return c_v64_shr_n_s8(a, shift);
111 }
112 template <int shift>
c_imm_v64_shl_n_16(c_v64 a)113 c_v64 c_imm_v64_shl_n_16(c_v64 a) {
114   return c_v64_shl_n_16(a, shift);
115 }
116 template <int shift>
c_imm_v64_shr_n_u16(c_v64 a)117 c_v64 c_imm_v64_shr_n_u16(c_v64 a) {
118   return c_v64_shr_n_u16(a, shift);
119 }
120 template <int shift>
c_imm_v64_shr_n_s16(c_v64 a)121 c_v64 c_imm_v64_shr_n_s16(c_v64 a) {
122   return c_v64_shr_n_s16(a, shift);
123 }
124 template <int shift>
c_imm_v64_shl_n_32(c_v64 a)125 c_v64 c_imm_v64_shl_n_32(c_v64 a) {
126   return c_v64_shl_n_32(a, shift);
127 }
128 template <int shift>
c_imm_v64_shr_n_u32(c_v64 a)129 c_v64 c_imm_v64_shr_n_u32(c_v64 a) {
130   return c_v64_shr_n_u32(a, shift);
131 }
132 template <int shift>
c_imm_v64_shr_n_s32(c_v64 a)133 c_v64 c_imm_v64_shr_n_s32(c_v64 a) {
134   return c_v64_shr_n_s32(a, shift);
135 }
136 template <int shift>
c_imm_v64_align(c_v64 a,c_v64 b)137 c_v64 c_imm_v64_align(c_v64 a, c_v64 b) {
138   return c_v64_align(a, b, shift);
139 }
140 
141 template <int shift>
imm_v128_shl_n_byte(v128 a)142 v128 imm_v128_shl_n_byte(v128 a) {
143   return v128_shl_n_byte(a, shift);
144 }
145 template <int shift>
imm_v128_shr_n_byte(v128 a)146 v128 imm_v128_shr_n_byte(v128 a) {
147   return v128_shr_n_byte(a, shift);
148 }
149 template <int shift>
imm_v128_shl_n_8(v128 a)150 v128 imm_v128_shl_n_8(v128 a) {
151   return v128_shl_n_8(a, shift);
152 }
153 template <int shift>
imm_v128_shr_n_u8(v128 a)154 v128 imm_v128_shr_n_u8(v128 a) {
155   return v128_shr_n_u8(a, shift);
156 }
157 template <int shift>
imm_v128_shr_n_s8(v128 a)158 v128 imm_v128_shr_n_s8(v128 a) {
159   return v128_shr_n_s8(a, shift);
160 }
161 template <int shift>
imm_v128_shl_n_16(v128 a)162 v128 imm_v128_shl_n_16(v128 a) {
163   return v128_shl_n_16(a, shift);
164 }
165 template <int shift>
imm_v128_shr_n_u16(v128 a)166 v128 imm_v128_shr_n_u16(v128 a) {
167   return v128_shr_n_u16(a, shift);
168 }
169 template <int shift>
imm_v128_shr_n_s16(v128 a)170 v128 imm_v128_shr_n_s16(v128 a) {
171   return v128_shr_n_s16(a, shift);
172 }
173 template <int shift>
imm_v128_shl_n_32(v128 a)174 v128 imm_v128_shl_n_32(v128 a) {
175   return v128_shl_n_32(a, shift);
176 }
177 template <int shift>
imm_v128_shr_n_u32(v128 a)178 v128 imm_v128_shr_n_u32(v128 a) {
179   return v128_shr_n_u32(a, shift);
180 }
181 template <int shift>
imm_v128_shr_n_s32(v128 a)182 v128 imm_v128_shr_n_s32(v128 a) {
183   return v128_shr_n_s32(a, shift);
184 }
185 template <int shift>
imm_v128_shl_n_64(v128 a)186 v128 imm_v128_shl_n_64(v128 a) {
187   return v128_shl_n_64(a, shift);
188 }
189 template <int shift>
imm_v128_shr_n_u64(v128 a)190 v128 imm_v128_shr_n_u64(v128 a) {
191   return v128_shr_n_u64(a, shift);
192 }
193 template <int shift>
imm_v128_shr_n_s64(v128 a)194 v128 imm_v128_shr_n_s64(v128 a) {
195   return v128_shr_n_s64(a, shift);
196 }
197 template <int shift>
imm_v128_align(v128 a,v128 b)198 v128 imm_v128_align(v128 a, v128 b) {
199   return v128_align(a, b, shift);
200 }
201 
202 template <int shift>
c_imm_v128_shl_n_byte(c_v128 a)203 c_v128 c_imm_v128_shl_n_byte(c_v128 a) {
204   return c_v128_shl_n_byte(a, shift);
205 }
206 template <int shift>
c_imm_v128_shr_n_byte(c_v128 a)207 c_v128 c_imm_v128_shr_n_byte(c_v128 a) {
208   return c_v128_shr_n_byte(a, shift);
209 }
210 template <int shift>
c_imm_v128_shl_n_8(c_v128 a)211 c_v128 c_imm_v128_shl_n_8(c_v128 a) {
212   return c_v128_shl_n_8(a, shift);
213 }
214 template <int shift>
c_imm_v128_shr_n_u8(c_v128 a)215 c_v128 c_imm_v128_shr_n_u8(c_v128 a) {
216   return c_v128_shr_n_u8(a, shift);
217 }
218 template <int shift>
c_imm_v128_shr_n_s8(c_v128 a)219 c_v128 c_imm_v128_shr_n_s8(c_v128 a) {
220   return c_v128_shr_n_s8(a, shift);
221 }
222 template <int shift>
c_imm_v128_shl_n_16(c_v128 a)223 c_v128 c_imm_v128_shl_n_16(c_v128 a) {
224   return c_v128_shl_n_16(a, shift);
225 }
226 template <int shift>
c_imm_v128_shr_n_u16(c_v128 a)227 c_v128 c_imm_v128_shr_n_u16(c_v128 a) {
228   return c_v128_shr_n_u16(a, shift);
229 }
230 template <int shift>
c_imm_v128_shr_n_s16(c_v128 a)231 c_v128 c_imm_v128_shr_n_s16(c_v128 a) {
232   return c_v128_shr_n_s16(a, shift);
233 }
234 template <int shift>
c_imm_v128_shl_n_32(c_v128 a)235 c_v128 c_imm_v128_shl_n_32(c_v128 a) {
236   return c_v128_shl_n_32(a, shift);
237 }
238 template <int shift>
c_imm_v128_shr_n_u32(c_v128 a)239 c_v128 c_imm_v128_shr_n_u32(c_v128 a) {
240   return c_v128_shr_n_u32(a, shift);
241 }
242 template <int shift>
c_imm_v128_shr_n_s32(c_v128 a)243 c_v128 c_imm_v128_shr_n_s32(c_v128 a) {
244   return c_v128_shr_n_s32(a, shift);
245 }
246 template <int shift>
c_imm_v128_shl_n_64(c_v128 a)247 c_v128 c_imm_v128_shl_n_64(c_v128 a) {
248   return c_v128_shl_n_64(a, shift);
249 }
250 template <int shift>
c_imm_v128_shr_n_u64(c_v128 a)251 c_v128 c_imm_v128_shr_n_u64(c_v128 a) {
252   return c_v128_shr_n_u64(a, shift);
253 }
254 template <int shift>
c_imm_v128_shr_n_s64(c_v128 a)255 c_v128 c_imm_v128_shr_n_s64(c_v128 a) {
256   return c_v128_shr_n_s64(a, shift);
257 }
258 template <int shift>
c_imm_v128_align(c_v128 a,c_v128 b)259 c_v128 c_imm_v128_align(c_v128 a, c_v128 b) {
260   return c_v128_align(a, b, shift);
261 }
262 
263 template <int shift>
imm_v256_shl_n_word(v256 a)264 v256 imm_v256_shl_n_word(v256 a) {
265   return v256_shl_n_word(a, shift);
266 }
267 template <int shift>
imm_v256_shr_n_word(v256 a)268 v256 imm_v256_shr_n_word(v256 a) {
269   return v256_shr_n_word(a, shift);
270 }
271 template <int shift>
imm_v256_shl_n_byte(v256 a)272 v256 imm_v256_shl_n_byte(v256 a) {
273   return v256_shl_n_byte(a, shift);
274 }
275 template <int shift>
imm_v256_shr_n_byte(v256 a)276 v256 imm_v256_shr_n_byte(v256 a) {
277   return v256_shr_n_byte(a, shift);
278 }
279 template <int shift>
imm_v256_shl_n_8(v256 a)280 v256 imm_v256_shl_n_8(v256 a) {
281   return v256_shl_n_8(a, shift);
282 }
283 template <int shift>
imm_v256_shr_n_u8(v256 a)284 v256 imm_v256_shr_n_u8(v256 a) {
285   return v256_shr_n_u8(a, shift);
286 }
287 template <int shift>
imm_v256_shr_n_s8(v256 a)288 v256 imm_v256_shr_n_s8(v256 a) {
289   return v256_shr_n_s8(a, shift);
290 }
291 template <int shift>
imm_v256_shl_n_16(v256 a)292 v256 imm_v256_shl_n_16(v256 a) {
293   return v256_shl_n_16(a, shift);
294 }
295 template <int shift>
imm_v256_shr_n_u16(v256 a)296 v256 imm_v256_shr_n_u16(v256 a) {
297   return v256_shr_n_u16(a, shift);
298 }
299 template <int shift>
imm_v256_shr_n_s16(v256 a)300 v256 imm_v256_shr_n_s16(v256 a) {
301   return v256_shr_n_s16(a, shift);
302 }
303 template <int shift>
imm_v256_shl_n_32(v256 a)304 v256 imm_v256_shl_n_32(v256 a) {
305   return v256_shl_n_32(a, shift);
306 }
307 template <int shift>
imm_v256_shr_n_u32(v256 a)308 v256 imm_v256_shr_n_u32(v256 a) {
309   return v256_shr_n_u32(a, shift);
310 }
311 template <int shift>
imm_v256_shr_n_s32(v256 a)312 v256 imm_v256_shr_n_s32(v256 a) {
313   return v256_shr_n_s32(a, shift);
314 }
315 template <int shift>
imm_v256_shl_n_64(v256 a)316 v256 imm_v256_shl_n_64(v256 a) {
317   return v256_shl_n_64(a, shift);
318 }
319 template <int shift>
imm_v256_shr_n_u64(v256 a)320 v256 imm_v256_shr_n_u64(v256 a) {
321   return v256_shr_n_u64(a, shift);
322 }
323 template <int shift>
imm_v256_shr_n_s64(v256 a)324 v256 imm_v256_shr_n_s64(v256 a) {
325   return v256_shr_n_s64(a, shift);
326 }
327 template <int shift>
imm_v256_align(v256 a,v256 b)328 v256 imm_v256_align(v256 a, v256 b) {
329   return v256_align(a, b, shift);
330 }
331 
332 template <int shift>
c_imm_v256_shl_n_word(c_v256 a)333 c_v256 c_imm_v256_shl_n_word(c_v256 a) {
334   return c_v256_shl_n_word(a, shift);
335 }
336 template <int shift>
c_imm_v256_shr_n_word(c_v256 a)337 c_v256 c_imm_v256_shr_n_word(c_v256 a) {
338   return c_v256_shr_n_word(a, shift);
339 }
340 template <int shift>
c_imm_v256_shl_n_byte(c_v256 a)341 c_v256 c_imm_v256_shl_n_byte(c_v256 a) {
342   return c_v256_shl_n_byte(a, shift);
343 }
344 template <int shift>
c_imm_v256_shr_n_byte(c_v256 a)345 c_v256 c_imm_v256_shr_n_byte(c_v256 a) {
346   return c_v256_shr_n_byte(a, shift);
347 }
348 template <int shift>
c_imm_v256_shl_n_8(c_v256 a)349 c_v256 c_imm_v256_shl_n_8(c_v256 a) {
350   return c_v256_shl_n_8(a, shift);
351 }
352 template <int shift>
c_imm_v256_shr_n_u8(c_v256 a)353 c_v256 c_imm_v256_shr_n_u8(c_v256 a) {
354   return c_v256_shr_n_u8(a, shift);
355 }
356 template <int shift>
c_imm_v256_shr_n_s8(c_v256 a)357 c_v256 c_imm_v256_shr_n_s8(c_v256 a) {
358   return c_v256_shr_n_s8(a, shift);
359 }
360 template <int shift>
c_imm_v256_shl_n_16(c_v256 a)361 c_v256 c_imm_v256_shl_n_16(c_v256 a) {
362   return c_v256_shl_n_16(a, shift);
363 }
364 template <int shift>
c_imm_v256_shr_n_u16(c_v256 a)365 c_v256 c_imm_v256_shr_n_u16(c_v256 a) {
366   return c_v256_shr_n_u16(a, shift);
367 }
368 template <int shift>
c_imm_v256_shr_n_s16(c_v256 a)369 c_v256 c_imm_v256_shr_n_s16(c_v256 a) {
370   return c_v256_shr_n_s16(a, shift);
371 }
372 template <int shift>
c_imm_v256_shl_n_32(c_v256 a)373 c_v256 c_imm_v256_shl_n_32(c_v256 a) {
374   return c_v256_shl_n_32(a, shift);
375 }
376 template <int shift>
c_imm_v256_shr_n_u32(c_v256 a)377 c_v256 c_imm_v256_shr_n_u32(c_v256 a) {
378   return c_v256_shr_n_u32(a, shift);
379 }
380 template <int shift>
c_imm_v256_shr_n_s32(c_v256 a)381 c_v256 c_imm_v256_shr_n_s32(c_v256 a) {
382   return c_v256_shr_n_s32(a, shift);
383 }
384 template <int shift>
c_imm_v256_shl_n_64(c_v256 a)385 c_v256 c_imm_v256_shl_n_64(c_v256 a) {
386   return c_v256_shl_n_64(a, shift);
387 }
388 template <int shift>
c_imm_v256_shr_n_u64(c_v256 a)389 c_v256 c_imm_v256_shr_n_u64(c_v256 a) {
390   return c_v256_shr_n_u64(a, shift);
391 }
392 template <int shift>
c_imm_v256_shr_n_s64(c_v256 a)393 c_v256 c_imm_v256_shr_n_s64(c_v256 a) {
394   return c_v256_shr_n_s64(a, shift);
395 }
396 template <int shift>
c_imm_v256_align(c_v256 a,c_v256 b)397 c_v256 c_imm_v256_align(c_v256 a, c_v256 b) {
398   return c_v256_align(a, b, shift);
399 }
400 
401 // Wrappers around the the SAD and SSD functions
v64_sad_u8(v64 a,v64 b)402 uint32_t v64_sad_u8(v64 a, v64 b) {
403   return v64_sad_u8_sum(::v64_sad_u8(v64_sad_u8_init(), a, b));
404 }
v64_ssd_u8(v64 a,v64 b)405 uint32_t v64_ssd_u8(v64 a, v64 b) {
406   return v64_ssd_u8_sum(::v64_ssd_u8(v64_ssd_u8_init(), a, b));
407 }
408 
c_v64_sad_u8(c_v64 a,c_v64 b)409 uint32_t c_v64_sad_u8(c_v64 a, c_v64 b) {
410   return c_v64_sad_u8_sum(::c_v64_sad_u8(c_v64_sad_u8_init(), a, b));
411 }
c_v64_ssd_u8(c_v64 a,c_v64 b)412 uint32_t c_v64_ssd_u8(c_v64 a, c_v64 b) {
413   return c_v64_ssd_u8_sum(::c_v64_ssd_u8(c_v64_ssd_u8_init(), a, b));
414 }
v128_sad_u8(v128 a,v128 b)415 uint32_t v128_sad_u8(v128 a, v128 b) {
416   return v128_sad_u8_sum(::v128_sad_u8(v128_sad_u8_init(), a, b));
417 }
v128_ssd_u8(v128 a,v128 b)418 uint32_t v128_ssd_u8(v128 a, v128 b) {
419   return v128_ssd_u8_sum(::v128_ssd_u8(v128_ssd_u8_init(), a, b));
420 }
c_v128_sad_u8(c_v128 a,c_v128 b)421 uint32_t c_v128_sad_u8(c_v128 a, c_v128 b) {
422   return c_v128_sad_u8_sum(::c_v128_sad_u8(c_v128_sad_u8_init(), a, b));
423 }
c_v128_ssd_u8(c_v128 a,c_v128 b)424 uint32_t c_v128_ssd_u8(c_v128 a, c_v128 b) {
425   return c_v128_ssd_u8_sum(::c_v128_ssd_u8(c_v128_ssd_u8_init(), a, b));
426 }
v128_sad_u16(v128 a,v128 b)427 uint32_t v128_sad_u16(v128 a, v128 b) {
428   return v128_sad_u16_sum(::v128_sad_u16(v128_sad_u16_init(), a, b));
429 }
v128_ssd_s16(v128 a,v128 b)430 uint64_t v128_ssd_s16(v128 a, v128 b) {
431   return v128_ssd_s16_sum(::v128_ssd_s16(v128_ssd_s16_init(), a, b));
432 }
c_v128_sad_u16(c_v128 a,c_v128 b)433 uint32_t c_v128_sad_u16(c_v128 a, c_v128 b) {
434   return c_v128_sad_u16_sum(::c_v128_sad_u16(c_v128_sad_u16_init(), a, b));
435 }
c_v128_ssd_s16(c_v128 a,c_v128 b)436 uint64_t c_v128_ssd_s16(c_v128 a, c_v128 b) {
437   return c_v128_ssd_s16_sum(::c_v128_ssd_s16(c_v128_ssd_s16_init(), a, b));
438 }
v256_sad_u8(v256 a,v256 b)439 uint32_t v256_sad_u8(v256 a, v256 b) {
440   return v256_sad_u8_sum(::v256_sad_u8(v256_sad_u8_init(), a, b));
441 }
v256_ssd_u8(v256 a,v256 b)442 uint32_t v256_ssd_u8(v256 a, v256 b) {
443   return v256_ssd_u8_sum(::v256_ssd_u8(v256_ssd_u8_init(), a, b));
444 }
c_v256_sad_u8(c_v256 a,c_v256 b)445 uint32_t c_v256_sad_u8(c_v256 a, c_v256 b) {
446   return c_v256_sad_u8_sum(::c_v256_sad_u8(c_v256_sad_u8_init(), a, b));
447 }
c_v256_ssd_u8(c_v256 a,c_v256 b)448 uint32_t c_v256_ssd_u8(c_v256 a, c_v256 b) {
449   return c_v256_ssd_u8_sum(::c_v256_ssd_u8(c_v256_ssd_u8_init(), a, b));
450 }
v256_sad_u16(v256 a,v256 b)451 uint32_t v256_sad_u16(v256 a, v256 b) {
452   return v256_sad_u16_sum(::v256_sad_u16(v256_sad_u16_init(), a, b));
453 }
v256_ssd_s16(v256 a,v256 b)454 uint64_t v256_ssd_s16(v256 a, v256 b) {
455   return v256_ssd_s16_sum(::v256_ssd_s16(v256_ssd_s16_init(), a, b));
456 }
c_v256_sad_u16(c_v256 a,c_v256 b)457 uint32_t c_v256_sad_u16(c_v256 a, c_v256 b) {
458   return c_v256_sad_u16_sum(::c_v256_sad_u16(c_v256_sad_u16_init(), a, b));
459 }
c_v256_ssd_s16(c_v256 a,c_v256 b)460 uint64_t c_v256_ssd_s16(c_v256 a, c_v256 b) {
461   return c_v256_ssd_s16_sum(::c_v256_ssd_s16(c_v256_ssd_s16_init(), a, b));
462 }
463 
464 namespace {
465 
// Type-erased function pointer: wrappers with heterogeneous signatures are
// cast to this common type so they can share one table entry layout.
typedef void (*fptr)();

// Associates a name with a pair of implementations: the C reference
// (c_##name) and the machine-tuned SIMD version (name).
typedef struct {
  const char *name;
  fptr ref;
  fptr simd;
} mapping;

// Builds a mapping entry from an intrinsic name via token pasting.
#define MAP(name) \
  { #name, reinterpret_cast<fptr>(c_##name), reinterpret_cast<fptr>(name) }
476 
477 const mapping m[] = { MAP(v64_sad_u8),
478                       MAP(v64_ssd_u8),
479                       MAP(v64_add_8),
480                       MAP(v64_add_16),
481                       MAP(v64_sadd_s8),
482                       MAP(v64_sadd_u8),
483                       MAP(v64_sadd_s16),
484                       MAP(v64_add_32),
485                       MAP(v64_sub_8),
486                       MAP(v64_ssub_u8),
487                       MAP(v64_ssub_s8),
488                       MAP(v64_sub_16),
489                       MAP(v64_ssub_s16),
490                       MAP(v64_ssub_u16),
491                       MAP(v64_sub_32),
492                       MAP(v64_ziplo_8),
493                       MAP(v64_ziphi_8),
494                       MAP(v64_ziplo_16),
495                       MAP(v64_ziphi_16),
496                       MAP(v64_ziplo_32),
497                       MAP(v64_ziphi_32),
498                       MAP(v64_pack_s32_u16),
499                       MAP(v64_pack_s32_s16),
500                       MAP(v64_pack_s16_u8),
501                       MAP(v64_pack_s16_s8),
502                       MAP(v64_unziphi_8),
503                       MAP(v64_unziplo_8),
504                       MAP(v64_unziphi_16),
505                       MAP(v64_unziplo_16),
506                       MAP(v64_or),
507                       MAP(v64_xor),
508                       MAP(v64_and),
509                       MAP(v64_andn),
510                       MAP(v64_mullo_s16),
511                       MAP(v64_mulhi_s16),
512                       MAP(v64_mullo_s32),
513                       MAP(v64_madd_s16),
514                       MAP(v64_madd_us8),
515                       MAP(v64_avg_u8),
516                       MAP(v64_rdavg_u8),
517                       MAP(v64_rdavg_u16),
518                       MAP(v64_avg_u16),
519                       MAP(v64_min_u8),
520                       MAP(v64_max_u8),
521                       MAP(v64_min_s8),
522                       MAP(v64_max_s8),
523                       MAP(v64_min_s16),
524                       MAP(v64_max_s16),
525                       MAP(v64_cmpgt_s8),
526                       MAP(v64_cmplt_s8),
527                       MAP(v64_cmpeq_8),
528                       MAP(v64_cmpgt_s16),
529                       MAP(v64_cmplt_s16),
530                       MAP(v64_cmpeq_16),
531                       MAP(v64_shuffle_8),
532                       MAP(imm_v64_align<1>),
533                       MAP(imm_v64_align<2>),
534                       MAP(imm_v64_align<3>),
535                       MAP(imm_v64_align<4>),
536                       MAP(imm_v64_align<5>),
537                       MAP(imm_v64_align<6>),
538                       MAP(imm_v64_align<7>),
539                       MAP(v64_abs_s8),
540                       MAP(v64_abs_s16),
541                       MAP(v64_unpacklo_u8_s16),
542                       MAP(v64_unpackhi_u8_s16),
543                       MAP(v64_unpacklo_s8_s16),
544                       MAP(v64_unpackhi_s8_s16),
545                       MAP(v64_unpacklo_u16_s32),
546                       MAP(v64_unpacklo_s16_s32),
547                       MAP(v64_unpackhi_u16_s32),
548                       MAP(v64_unpackhi_s16_s32),
549                       MAP(imm_v64_shr_n_byte<1>),
550                       MAP(imm_v64_shr_n_byte<2>),
551                       MAP(imm_v64_shr_n_byte<3>),
552                       MAP(imm_v64_shr_n_byte<4>),
553                       MAP(imm_v64_shr_n_byte<5>),
554                       MAP(imm_v64_shr_n_byte<6>),
555                       MAP(imm_v64_shr_n_byte<7>),
556                       MAP(imm_v64_shl_n_byte<1>),
557                       MAP(imm_v64_shl_n_byte<2>),
558                       MAP(imm_v64_shl_n_byte<3>),
559                       MAP(imm_v64_shl_n_byte<4>),
560                       MAP(imm_v64_shl_n_byte<5>),
561                       MAP(imm_v64_shl_n_byte<6>),
562                       MAP(imm_v64_shl_n_byte<7>),
563                       MAP(imm_v64_shl_n_8<1>),
564                       MAP(imm_v64_shl_n_8<2>),
565                       MAP(imm_v64_shl_n_8<3>),
566                       MAP(imm_v64_shl_n_8<4>),
567                       MAP(imm_v64_shl_n_8<5>),
568                       MAP(imm_v64_shl_n_8<6>),
569                       MAP(imm_v64_shl_n_8<7>),
570                       MAP(imm_v64_shr_n_u8<1>),
571                       MAP(imm_v64_shr_n_u8<2>),
572                       MAP(imm_v64_shr_n_u8<3>),
573                       MAP(imm_v64_shr_n_u8<4>),
574                       MAP(imm_v64_shr_n_u8<5>),
575                       MAP(imm_v64_shr_n_u8<6>),
576                       MAP(imm_v64_shr_n_u8<7>),
577                       MAP(imm_v64_shr_n_s8<1>),
578                       MAP(imm_v64_shr_n_s8<2>),
579                       MAP(imm_v64_shr_n_s8<3>),
580                       MAP(imm_v64_shr_n_s8<4>),
581                       MAP(imm_v64_shr_n_s8<5>),
582                       MAP(imm_v64_shr_n_s8<6>),
583                       MAP(imm_v64_shr_n_s8<7>),
584                       MAP(imm_v64_shl_n_16<1>),
585                       MAP(imm_v64_shl_n_16<2>),
586                       MAP(imm_v64_shl_n_16<4>),
587                       MAP(imm_v64_shl_n_16<6>),
588                       MAP(imm_v64_shl_n_16<8>),
589                       MAP(imm_v64_shl_n_16<10>),
590                       MAP(imm_v64_shl_n_16<12>),
591                       MAP(imm_v64_shl_n_16<14>),
592                       MAP(imm_v64_shr_n_u16<1>),
593                       MAP(imm_v64_shr_n_u16<2>),
594                       MAP(imm_v64_shr_n_u16<4>),
595                       MAP(imm_v64_shr_n_u16<6>),
596                       MAP(imm_v64_shr_n_u16<8>),
597                       MAP(imm_v64_shr_n_u16<10>),
598                       MAP(imm_v64_shr_n_u16<12>),
599                       MAP(imm_v64_shr_n_u16<14>),
600                       MAP(imm_v64_shr_n_s16<1>),
601                       MAP(imm_v64_shr_n_s16<2>),
602                       MAP(imm_v64_shr_n_s16<4>),
603                       MAP(imm_v64_shr_n_s16<6>),
604                       MAP(imm_v64_shr_n_s16<8>),
605                       MAP(imm_v64_shr_n_s16<10>),
606                       MAP(imm_v64_shr_n_s16<12>),
607                       MAP(imm_v64_shr_n_s16<14>),
608                       MAP(imm_v64_shl_n_32<1>),
609                       MAP(imm_v64_shl_n_32<4>),
610                       MAP(imm_v64_shl_n_32<8>),
611                       MAP(imm_v64_shl_n_32<12>),
612                       MAP(imm_v64_shl_n_32<16>),
613                       MAP(imm_v64_shl_n_32<20>),
614                       MAP(imm_v64_shl_n_32<24>),
615                       MAP(imm_v64_shl_n_32<28>),
616                       MAP(imm_v64_shr_n_u32<1>),
617                       MAP(imm_v64_shr_n_u32<4>),
618                       MAP(imm_v64_shr_n_u32<8>),
619                       MAP(imm_v64_shr_n_u32<12>),
620                       MAP(imm_v64_shr_n_u32<16>),
621                       MAP(imm_v64_shr_n_u32<20>),
622                       MAP(imm_v64_shr_n_u32<24>),
623                       MAP(imm_v64_shr_n_u32<28>),
624                       MAP(imm_v64_shr_n_s32<1>),
625                       MAP(imm_v64_shr_n_s32<4>),
626                       MAP(imm_v64_shr_n_s32<8>),
627                       MAP(imm_v64_shr_n_s32<12>),
628                       MAP(imm_v64_shr_n_s32<16>),
629                       MAP(imm_v64_shr_n_s32<20>),
630                       MAP(imm_v64_shr_n_s32<24>),
631                       MAP(imm_v64_shr_n_s32<28>),
632                       MAP(v64_shl_8),
633                       MAP(v64_shr_u8),
634                       MAP(v64_shr_s8),
635                       MAP(v64_shl_16),
636                       MAP(v64_shr_u16),
637                       MAP(v64_shr_s16),
638                       MAP(v64_shl_32),
639                       MAP(v64_shr_u32),
640                       MAP(v64_shr_s32),
641                       MAP(v64_hadd_u8),
642                       MAP(v64_hadd_s16),
643                       MAP(v64_dotp_s16),
644                       MAP(v64_dotp_su8),
645                       MAP(v64_u64),
646                       MAP(v64_low_u32),
647                       MAP(v64_high_u32),
648                       MAP(v64_low_s32),
649                       MAP(v64_high_s32),
650                       MAP(v64_dup_8),
651                       MAP(v64_dup_16),
652                       MAP(v64_dup_32),
653                       MAP(v64_from_32),
654                       MAP(v64_zero),
655                       MAP(v64_from_16),
656                       MAP(v128_sad_u8),
657                       MAP(v128_ssd_u8),
658                       MAP(v128_sad_u16),
659                       MAP(v128_ssd_s16),
660                       MAP(v128_add_8),
661                       MAP(v128_add_16),
662                       MAP(v128_sadd_s8),
663                       MAP(v128_sadd_u8),
664                       MAP(v128_sadd_s16),
665                       MAP(v128_add_32),
666                       MAP(v128_add_64),
667                       MAP(v128_sub_8),
668                       MAP(v128_ssub_u8),
669                       MAP(v128_ssub_s8),
670                       MAP(v128_sub_16),
671                       MAP(v128_ssub_s16),
672                       MAP(v128_ssub_u16),
673                       MAP(v128_sub_32),
674                       MAP(v128_sub_64),
675                       MAP(v128_ziplo_8),
676                       MAP(v128_ziphi_8),
677                       MAP(v128_ziplo_16),
678                       MAP(v128_ziphi_16),
679                       MAP(v128_ziplo_32),
680                       MAP(v128_ziphi_32),
681                       MAP(v128_ziplo_64),
682                       MAP(v128_ziphi_64),
683                       MAP(v128_unziphi_8),
684                       MAP(v128_unziplo_8),
685                       MAP(v128_unziphi_16),
686                       MAP(v128_unziplo_16),
687                       MAP(v128_unziphi_32),
688                       MAP(v128_unziplo_32),
689                       MAP(v128_pack_s32_u16),
690                       MAP(v128_pack_s32_s16),
691                       MAP(v128_pack_s16_u8),
692                       MAP(v128_pack_s16_s8),
693                       MAP(v128_or),
694                       MAP(v128_xor),
695                       MAP(v128_and),
696                       MAP(v128_andn),
697                       MAP(v128_mullo_s16),
698                       MAP(v128_mulhi_s16),
699                       MAP(v128_mullo_s32),
700                       MAP(v128_madd_s16),
701                       MAP(v128_madd_us8),
702                       MAP(v128_avg_u8),
703                       MAP(v128_rdavg_u8),
704                       MAP(v128_rdavg_u16),
705                       MAP(v128_avg_u16),
706                       MAP(v128_min_u8),
707                       MAP(v128_max_u8),
708                       MAP(v128_min_s8),
709                       MAP(v128_max_s8),
710                       MAP(v128_min_s16),
711                       MAP(v128_max_s16),
712                       MAP(v128_min_s32),
713                       MAP(v128_max_s32),
714                       MAP(v128_cmpgt_s8),
715                       MAP(v128_cmplt_s8),
716                       MAP(v128_cmpeq_8),
717                       MAP(v128_cmpgt_s16),
718                       MAP(v128_cmpeq_16),
719                       MAP(v128_cmplt_s16),
720                       MAP(v128_cmpgt_s32),
721                       MAP(v128_cmpeq_32),
722                       MAP(v128_cmplt_s32),
723                       MAP(v128_shuffle_8),
724                       MAP(imm_v128_align<1>),
725                       MAP(imm_v128_align<2>),
726                       MAP(imm_v128_align<3>),
727                       MAP(imm_v128_align<4>),
728                       MAP(imm_v128_align<5>),
729                       MAP(imm_v128_align<6>),
730                       MAP(imm_v128_align<7>),
731                       MAP(imm_v128_align<8>),
732                       MAP(imm_v128_align<9>),
733                       MAP(imm_v128_align<10>),
734                       MAP(imm_v128_align<11>),
735                       MAP(imm_v128_align<12>),
736                       MAP(imm_v128_align<13>),
737                       MAP(imm_v128_align<14>),
738                       MAP(imm_v128_align<15>),
739                       MAP(v128_abs_s8),
740                       MAP(v128_abs_s16),
741                       MAP(v128_padd_u8),
742                       MAP(v128_padd_s16),
743                       MAP(v128_unpacklo_u16_s32),
744                       MAP(v128_unpacklo_s16_s32),
745                       MAP(v128_unpackhi_u16_s32),
746                       MAP(v128_unpackhi_s16_s32),
747                       MAP(imm_v128_shr_n_byte<1>),
748                       MAP(imm_v128_shr_n_byte<2>),
749                       MAP(imm_v128_shr_n_byte<3>),
750                       MAP(imm_v128_shr_n_byte<4>),
751                       MAP(imm_v128_shr_n_byte<5>),
752                       MAP(imm_v128_shr_n_byte<6>),
753                       MAP(imm_v128_shr_n_byte<7>),
754                       MAP(imm_v128_shr_n_byte<8>),
755                       MAP(imm_v128_shr_n_byte<9>),
756                       MAP(imm_v128_shr_n_byte<10>),
757                       MAP(imm_v128_shr_n_byte<11>),
758                       MAP(imm_v128_shr_n_byte<12>),
759                       MAP(imm_v128_shr_n_byte<13>),
760                       MAP(imm_v128_shr_n_byte<14>),
761                       MAP(imm_v128_shr_n_byte<15>),
762                       MAP(imm_v128_shl_n_byte<1>),
763                       MAP(imm_v128_shl_n_byte<2>),
764                       MAP(imm_v128_shl_n_byte<3>),
765                       MAP(imm_v128_shl_n_byte<4>),
766                       MAP(imm_v128_shl_n_byte<5>),
767                       MAP(imm_v128_shl_n_byte<6>),
768                       MAP(imm_v128_shl_n_byte<7>),
769                       MAP(imm_v128_shl_n_byte<8>),
770                       MAP(imm_v128_shl_n_byte<9>),
771                       MAP(imm_v128_shl_n_byte<10>),
772                       MAP(imm_v128_shl_n_byte<11>),
773                       MAP(imm_v128_shl_n_byte<12>),
774                       MAP(imm_v128_shl_n_byte<13>),
775                       MAP(imm_v128_shl_n_byte<14>),
776                       MAP(imm_v128_shl_n_byte<15>),
777                       MAP(imm_v128_shl_n_8<1>),
778                       MAP(imm_v128_shl_n_8<2>),
779                       MAP(imm_v128_shl_n_8<3>),
780                       MAP(imm_v128_shl_n_8<4>),
781                       MAP(imm_v128_shl_n_8<5>),
782                       MAP(imm_v128_shl_n_8<6>),
783                       MAP(imm_v128_shl_n_8<7>),
784                       MAP(imm_v128_shr_n_u8<1>),
785                       MAP(imm_v128_shr_n_u8<2>),
786                       MAP(imm_v128_shr_n_u8<3>),
787                       MAP(imm_v128_shr_n_u8<4>),
788                       MAP(imm_v128_shr_n_u8<5>),
789                       MAP(imm_v128_shr_n_u8<6>),
790                       MAP(imm_v128_shr_n_u8<7>),
791                       MAP(imm_v128_shr_n_s8<1>),
792                       MAP(imm_v128_shr_n_s8<2>),
793                       MAP(imm_v128_shr_n_s8<3>),
794                       MAP(imm_v128_shr_n_s8<4>),
795                       MAP(imm_v128_shr_n_s8<5>),
796                       MAP(imm_v128_shr_n_s8<6>),
797                       MAP(imm_v128_shr_n_s8<7>),
798                       MAP(imm_v128_shl_n_16<1>),
799                       MAP(imm_v128_shl_n_16<2>),
800                       MAP(imm_v128_shl_n_16<4>),
801                       MAP(imm_v128_shl_n_16<6>),
802                       MAP(imm_v128_shl_n_16<8>),
803                       MAP(imm_v128_shl_n_16<10>),
804                       MAP(imm_v128_shl_n_16<12>),
805                       MAP(imm_v128_shl_n_16<14>),
806                       MAP(imm_v128_shr_n_u16<1>),
807                       MAP(imm_v128_shr_n_u16<2>),
808                       MAP(imm_v128_shr_n_u16<4>),
809                       MAP(imm_v128_shr_n_u16<6>),
810                       MAP(imm_v128_shr_n_u16<8>),
811                       MAP(imm_v128_shr_n_u16<10>),
812                       MAP(imm_v128_shr_n_u16<12>),
813                       MAP(imm_v128_shr_n_u16<14>),
814                       MAP(imm_v128_shr_n_s16<1>),
815                       MAP(imm_v128_shr_n_s16<2>),
816                       MAP(imm_v128_shr_n_s16<4>),
817                       MAP(imm_v128_shr_n_s16<6>),
818                       MAP(imm_v128_shr_n_s16<8>),
819                       MAP(imm_v128_shr_n_s16<10>),
820                       MAP(imm_v128_shr_n_s16<12>),
821                       MAP(imm_v128_shr_n_s16<14>),
822                       MAP(imm_v128_shl_n_32<1>),
823                       MAP(imm_v128_shl_n_32<4>),
824                       MAP(imm_v128_shl_n_32<8>),
825                       MAP(imm_v128_shl_n_32<12>),
826                       MAP(imm_v128_shl_n_32<16>),
827                       MAP(imm_v128_shl_n_32<20>),
828                       MAP(imm_v128_shl_n_32<24>),
829                       MAP(imm_v128_shl_n_32<28>),
830                       MAP(imm_v128_shr_n_u32<1>),
831                       MAP(imm_v128_shr_n_u32<4>),
832                       MAP(imm_v128_shr_n_u32<8>),
833                       MAP(imm_v128_shr_n_u32<12>),
834                       MAP(imm_v128_shr_n_u32<16>),
835                       MAP(imm_v128_shr_n_u32<20>),
836                       MAP(imm_v128_shr_n_u32<24>),
837                       MAP(imm_v128_shr_n_u32<28>),
838                       MAP(imm_v128_shr_n_s32<1>),
839                       MAP(imm_v128_shr_n_s32<4>),
840                       MAP(imm_v128_shr_n_s32<8>),
841                       MAP(imm_v128_shr_n_s32<12>),
842                       MAP(imm_v128_shr_n_s32<16>),
843                       MAP(imm_v128_shr_n_s32<20>),
844                       MAP(imm_v128_shr_n_s32<24>),
845                       MAP(imm_v128_shr_n_s32<28>),
846                       MAP(imm_v128_shl_n_64<1>),
847                       MAP(imm_v128_shl_n_64<4>),
848                       MAP(imm_v128_shl_n_64<8>),
849                       MAP(imm_v128_shl_n_64<12>),
850                       MAP(imm_v128_shl_n_64<16>),
851                       MAP(imm_v128_shl_n_64<20>),
852                       MAP(imm_v128_shl_n_64<24>),
853                       MAP(imm_v128_shl_n_64<28>),
854                       MAP(imm_v128_shl_n_64<32>),
855                       MAP(imm_v128_shl_n_64<36>),
856                       MAP(imm_v128_shl_n_64<40>),
857                       MAP(imm_v128_shl_n_64<44>),
858                       MAP(imm_v128_shl_n_64<48>),
859                       MAP(imm_v128_shl_n_64<52>),
860                       MAP(imm_v128_shl_n_64<56>),
861                       MAP(imm_v128_shl_n_64<60>),
862                       MAP(imm_v128_shr_n_u64<1>),
863                       MAP(imm_v128_shr_n_u64<4>),
864                       MAP(imm_v128_shr_n_u64<8>),
865                       MAP(imm_v128_shr_n_u64<12>),
866                       MAP(imm_v128_shr_n_u64<16>),
867                       MAP(imm_v128_shr_n_u64<20>),
868                       MAP(imm_v128_shr_n_u64<24>),
869                       MAP(imm_v128_shr_n_u64<28>),
870                       MAP(imm_v128_shr_n_u64<32>),
871                       MAP(imm_v128_shr_n_u64<36>),
872                       MAP(imm_v128_shr_n_u64<40>),
873                       MAP(imm_v128_shr_n_u64<44>),
874                       MAP(imm_v128_shr_n_u64<48>),
875                       MAP(imm_v128_shr_n_u64<52>),
876                       MAP(imm_v128_shr_n_u64<56>),
877                       MAP(imm_v128_shr_n_u64<60>),
878                       MAP(imm_v128_shr_n_s64<1>),
879                       MAP(imm_v128_shr_n_s64<4>),
880                       MAP(imm_v128_shr_n_s64<8>),
881                       MAP(imm_v128_shr_n_s64<12>),
882                       MAP(imm_v128_shr_n_s64<16>),
883                       MAP(imm_v128_shr_n_s64<20>),
884                       MAP(imm_v128_shr_n_s64<24>),
885                       MAP(imm_v128_shr_n_s64<28>),
886                       MAP(imm_v128_shr_n_s64<32>),
887                       MAP(imm_v128_shr_n_s64<36>),
888                       MAP(imm_v128_shr_n_s64<40>),
889                       MAP(imm_v128_shr_n_s64<44>),
890                       MAP(imm_v128_shr_n_s64<48>),
891                       MAP(imm_v128_shr_n_s64<52>),
892                       MAP(imm_v128_shr_n_s64<56>),
893                       MAP(imm_v128_shr_n_s64<60>),
894                       MAP(v128_from_v64),
895                       MAP(v128_zip_8),
896                       MAP(v128_zip_16),
897                       MAP(v128_zip_32),
898                       MAP(v128_mul_s16),
899                       MAP(v128_unpack_u8_s16),
900                       MAP(v128_unpack_s8_s16),
901                       MAP(v128_unpack_u16_s32),
902                       MAP(v128_unpack_s16_s32),
903                       MAP(v128_shl_8),
904                       MAP(v128_shr_u8),
905                       MAP(v128_shr_s8),
906                       MAP(v128_shl_16),
907                       MAP(v128_shr_u16),
908                       MAP(v128_shr_s16),
909                       MAP(v128_shl_32),
910                       MAP(v128_shr_u32),
911                       MAP(v128_shr_s32),
912                       MAP(v128_shl_64),
913                       MAP(v128_shr_u64),
914                       MAP(v128_shr_s64),
915                       MAP(v128_hadd_u8),
916                       MAP(v128_dotp_su8),
917                       MAP(v128_dotp_s16),
918                       MAP(v128_dotp_s32),
919                       MAP(v128_low_u32),
920                       MAP(v128_low_v64),
921                       MAP(v128_high_v64),
922                       MAP(v128_from_64),
923                       MAP(v128_from_32),
924                       MAP(v128_movemask_8),
925                       MAP(v128_zero),
926                       MAP(v128_dup_8),
927                       MAP(v128_dup_16),
928                       MAP(v128_dup_32),
929                       MAP(v128_dup_64),
930                       MAP(v128_unpacklo_u8_s16),
931                       MAP(v128_unpackhi_u8_s16),
932                       MAP(v128_unpacklo_s8_s16),
933                       MAP(v128_unpackhi_s8_s16),
934                       MAP(v128_blend_8),
935                       MAP(u32_load_unaligned),
936                       MAP(u32_store_unaligned),
937                       MAP(v64_load_unaligned),
938                       MAP(v64_store_unaligned),
939                       MAP(v128_load_unaligned),
940                       MAP(v128_store_unaligned),
941                       MAP(v256_sad_u8),
942                       MAP(v256_ssd_u8),
943                       MAP(v256_sad_u16),
944                       MAP(v256_ssd_s16),
945                       MAP(v256_hadd_u8),
946                       MAP(v256_low_u64),
947                       MAP(v256_dotp_su8),
948                       MAP(v256_dotp_s16),
949                       MAP(v256_dotp_s32),
950                       MAP(v256_add_8),
951                       MAP(v256_add_16),
952                       MAP(v256_sadd_s8),
953                       MAP(v256_sadd_u8),
954                       MAP(v256_sadd_s16),
955                       MAP(v256_add_32),
956                       MAP(v256_add_64),
957                       MAP(v256_sub_8),
958                       MAP(v256_ssub_u8),
959                       MAP(v256_ssub_s8),
960                       MAP(v256_sub_16),
961                       MAP(v256_ssub_u16),
962                       MAP(v256_ssub_s16),
963                       MAP(v256_sub_32),
964                       MAP(v256_sub_64),
965                       MAP(v256_ziplo_8),
966                       MAP(v256_ziphi_8),
967                       MAP(v256_ziplo_16),
968                       MAP(v256_ziphi_16),
969                       MAP(v256_ziplo_32),
970                       MAP(v256_ziphi_32),
971                       MAP(v256_ziplo_64),
972                       MAP(v256_ziphi_64),
973                       MAP(v256_unziphi_8),
974                       MAP(v256_unziplo_8),
975                       MAP(v256_unziphi_16),
976                       MAP(v256_unziplo_16),
977                       MAP(v256_unziphi_32),
978                       MAP(v256_unziplo_32),
979                       MAP(v256_unziphi_64),
980                       MAP(v256_unziplo_64),
981                       MAP(v256_pack_s32_u16),
982                       MAP(v256_pack_s32_s16),
983                       MAP(v256_pack_s16_u8),
984                       MAP(v256_pack_s16_s8),
985                       MAP(v256_or),
986                       MAP(v256_xor),
987                       MAP(v256_and),
988                       MAP(v256_andn),
989                       MAP(v256_mullo_s16),
990                       MAP(v256_mulhi_s16),
991                       MAP(v256_mullo_s32),
992                       MAP(v256_madd_s16),
993                       MAP(v256_madd_us8),
994                       MAP(v256_avg_u8),
995                       MAP(v256_rdavg_u8),
996                       MAP(v256_rdavg_u16),
997                       MAP(v256_avg_u16),
998                       MAP(v256_min_u8),
999                       MAP(v256_max_u8),
1000                       MAP(v256_min_s8),
1001                       MAP(v256_max_s8),
1002                       MAP(v256_min_s16),
1003                       MAP(v256_max_s16),
1004                       MAP(v256_min_s32),
1005                       MAP(v256_max_s32),
1006                       MAP(v256_cmpgt_s8),
1007                       MAP(v256_cmplt_s8),
1008                       MAP(v256_cmpeq_8),
1009                       MAP(v256_cmpgt_s16),
1010                       MAP(v256_cmplt_s16),
1011                       MAP(v256_cmpeq_16),
1012                       MAP(v256_cmpgt_s32),
1013                       MAP(v256_cmplt_s32),
1014                       MAP(v256_cmpeq_32),
1015                       MAP(v256_shuffle_8),
1016                       MAP(v256_pshuffle_8),
1017                       MAP(v256_wideshuffle_8),
1018                       MAP(imm_v256_align<1>),
1019                       MAP(imm_v256_align<2>),
1020                       MAP(imm_v256_align<3>),
1021                       MAP(imm_v256_align<4>),
1022                       MAP(imm_v256_align<5>),
1023                       MAP(imm_v256_align<6>),
1024                       MAP(imm_v256_align<7>),
1025                       MAP(imm_v256_align<8>),
1026                       MAP(imm_v256_align<9>),
1027                       MAP(imm_v256_align<10>),
1028                       MAP(imm_v256_align<11>),
1029                       MAP(imm_v256_align<12>),
1030                       MAP(imm_v256_align<13>),
1031                       MAP(imm_v256_align<14>),
1032                       MAP(imm_v256_align<15>),
1033                       MAP(imm_v256_align<16>),
1034                       MAP(imm_v256_align<17>),
1035                       MAP(imm_v256_align<18>),
1036                       MAP(imm_v256_align<19>),
1037                       MAP(imm_v256_align<20>),
1038                       MAP(imm_v256_align<21>),
1039                       MAP(imm_v256_align<22>),
1040                       MAP(imm_v256_align<23>),
1041                       MAP(imm_v256_align<24>),
1042                       MAP(imm_v256_align<25>),
1043                       MAP(imm_v256_align<26>),
1044                       MAP(imm_v256_align<27>),
1045                       MAP(imm_v256_align<28>),
1046                       MAP(imm_v256_align<29>),
1047                       MAP(imm_v256_align<30>),
1048                       MAP(imm_v256_align<31>),
1049                       MAP(v256_from_v128),
1050                       MAP(v256_zip_8),
1051                       MAP(v256_zip_16),
1052                       MAP(v256_zip_32),
1053                       MAP(v256_mul_s16),
1054                       MAP(v256_unpack_u8_s16),
1055                       MAP(v256_unpack_s8_s16),
1056                       MAP(v256_unpack_u16_s32),
1057                       MAP(v256_unpack_s16_s32),
1058                       MAP(v256_shl_8),
1059                       MAP(v256_shr_u8),
1060                       MAP(v256_shr_s8),
1061                       MAP(v256_shl_16),
1062                       MAP(v256_shr_u16),
1063                       MAP(v256_shr_s16),
1064                       MAP(v256_shl_32),
1065                       MAP(v256_shr_u32),
1066                       MAP(v256_shr_s32),
1067                       MAP(v256_shl_64),
1068                       MAP(v256_shr_u64),
1069                       MAP(v256_shr_s64),
1070                       MAP(v256_abs_s8),
1071                       MAP(v256_abs_s16),
1072                       MAP(v256_padd_u8),
1073                       MAP(v256_padd_s16),
1074                       MAP(v256_unpacklo_u16_s32),
1075                       MAP(v256_unpacklo_s16_s32),
1076                       MAP(v256_unpackhi_u16_s32),
1077                       MAP(v256_unpackhi_s16_s32),
1078                       MAP(imm_v256_shr_n_word<1>),
1079                       MAP(imm_v256_shr_n_word<2>),
1080                       MAP(imm_v256_shr_n_word<3>),
1081                       MAP(imm_v256_shr_n_word<4>),
1082                       MAP(imm_v256_shr_n_word<5>),
1083                       MAP(imm_v256_shr_n_word<6>),
1084                       MAP(imm_v256_shr_n_word<7>),
1085                       MAP(imm_v256_shr_n_word<8>),
1086                       MAP(imm_v256_shr_n_word<9>),
1087                       MAP(imm_v256_shr_n_word<10>),
1088                       MAP(imm_v256_shr_n_word<11>),
1089                       MAP(imm_v256_shr_n_word<12>),
1090                       MAP(imm_v256_shr_n_word<13>),
1091                       MAP(imm_v256_shr_n_word<14>),
1092                       MAP(imm_v256_shr_n_word<15>),
1093                       MAP(imm_v256_shl_n_word<1>),
1094                       MAP(imm_v256_shl_n_word<2>),
1095                       MAP(imm_v256_shl_n_word<3>),
1096                       MAP(imm_v256_shl_n_word<4>),
1097                       MAP(imm_v256_shl_n_word<5>),
1098                       MAP(imm_v256_shl_n_word<6>),
1099                       MAP(imm_v256_shl_n_word<7>),
1100                       MAP(imm_v256_shl_n_word<8>),
1101                       MAP(imm_v256_shl_n_word<9>),
1102                       MAP(imm_v256_shl_n_word<10>),
1103                       MAP(imm_v256_shl_n_word<11>),
1104                       MAP(imm_v256_shl_n_word<12>),
1105                       MAP(imm_v256_shl_n_word<13>),
1106                       MAP(imm_v256_shl_n_word<14>),
1107                       MAP(imm_v256_shl_n_word<15>),
1108                       MAP(imm_v256_shr_n_byte<1>),
1109                       MAP(imm_v256_shr_n_byte<2>),
1110                       MAP(imm_v256_shr_n_byte<3>),
1111                       MAP(imm_v256_shr_n_byte<4>),
1112                       MAP(imm_v256_shr_n_byte<5>),
1113                       MAP(imm_v256_shr_n_byte<6>),
1114                       MAP(imm_v256_shr_n_byte<7>),
1115                       MAP(imm_v256_shr_n_byte<8>),
1116                       MAP(imm_v256_shr_n_byte<9>),
1117                       MAP(imm_v256_shr_n_byte<10>),
1118                       MAP(imm_v256_shr_n_byte<11>),
1119                       MAP(imm_v256_shr_n_byte<12>),
1120                       MAP(imm_v256_shr_n_byte<13>),
1121                       MAP(imm_v256_shr_n_byte<14>),
1122                       MAP(imm_v256_shr_n_byte<15>),
1123                       MAP(imm_v256_shr_n_byte<16>),
1124                       MAP(imm_v256_shr_n_byte<17>),
1125                       MAP(imm_v256_shr_n_byte<18>),
1126                       MAP(imm_v256_shr_n_byte<19>),
1127                       MAP(imm_v256_shr_n_byte<20>),
1128                       MAP(imm_v256_shr_n_byte<21>),
1129                       MAP(imm_v256_shr_n_byte<22>),
1130                       MAP(imm_v256_shr_n_byte<23>),
1131                       MAP(imm_v256_shr_n_byte<24>),
1132                       MAP(imm_v256_shr_n_byte<25>),
1133                       MAP(imm_v256_shr_n_byte<26>),
1134                       MAP(imm_v256_shr_n_byte<27>),
1135                       MAP(imm_v256_shr_n_byte<28>),
1136                       MAP(imm_v256_shr_n_byte<29>),
1137                       MAP(imm_v256_shr_n_byte<30>),
1138                       MAP(imm_v256_shr_n_byte<31>),
1139                       MAP(imm_v256_shl_n_byte<1>),
1140                       MAP(imm_v256_shl_n_byte<2>),
1141                       MAP(imm_v256_shl_n_byte<3>),
1142                       MAP(imm_v256_shl_n_byte<4>),
1143                       MAP(imm_v256_shl_n_byte<5>),
1144                       MAP(imm_v256_shl_n_byte<6>),
1145                       MAP(imm_v256_shl_n_byte<7>),
1146                       MAP(imm_v256_shl_n_byte<8>),
1147                       MAP(imm_v256_shl_n_byte<9>),
1148                       MAP(imm_v256_shl_n_byte<10>),
1149                       MAP(imm_v256_shl_n_byte<11>),
1150                       MAP(imm_v256_shl_n_byte<12>),
1151                       MAP(imm_v256_shl_n_byte<13>),
1152                       MAP(imm_v256_shl_n_byte<14>),
1153                       MAP(imm_v256_shl_n_byte<15>),
1154                       MAP(imm_v256_shl_n_byte<16>),
1155                       MAP(imm_v256_shl_n_byte<17>),
1156                       MAP(imm_v256_shl_n_byte<18>),
1157                       MAP(imm_v256_shl_n_byte<19>),
1158                       MAP(imm_v256_shl_n_byte<20>),
1159                       MAP(imm_v256_shl_n_byte<21>),
1160                       MAP(imm_v256_shl_n_byte<22>),
1161                       MAP(imm_v256_shl_n_byte<23>),
1162                       MAP(imm_v256_shl_n_byte<24>),
1163                       MAP(imm_v256_shl_n_byte<25>),
1164                       MAP(imm_v256_shl_n_byte<26>),
1165                       MAP(imm_v256_shl_n_byte<27>),
1166                       MAP(imm_v256_shl_n_byte<28>),
1167                       MAP(imm_v256_shl_n_byte<29>),
1168                       MAP(imm_v256_shl_n_byte<30>),
1169                       MAP(imm_v256_shl_n_byte<31>),
1170                       MAP(imm_v256_shl_n_8<1>),
1171                       MAP(imm_v256_shl_n_8<2>),
1172                       MAP(imm_v256_shl_n_8<3>),
1173                       MAP(imm_v256_shl_n_8<4>),
1174                       MAP(imm_v256_shl_n_8<5>),
1175                       MAP(imm_v256_shl_n_8<6>),
1176                       MAP(imm_v256_shl_n_8<7>),
1177                       MAP(imm_v256_shr_n_u8<1>),
1178                       MAP(imm_v256_shr_n_u8<2>),
1179                       MAP(imm_v256_shr_n_u8<3>),
1180                       MAP(imm_v256_shr_n_u8<4>),
1181                       MAP(imm_v256_shr_n_u8<5>),
1182                       MAP(imm_v256_shr_n_u8<6>),
1183                       MAP(imm_v256_shr_n_u8<7>),
1184                       MAP(imm_v256_shr_n_s8<1>),
1185                       MAP(imm_v256_shr_n_s8<2>),
1186                       MAP(imm_v256_shr_n_s8<3>),
1187                       MAP(imm_v256_shr_n_s8<4>),
1188                       MAP(imm_v256_shr_n_s8<5>),
1189                       MAP(imm_v256_shr_n_s8<6>),
1190                       MAP(imm_v256_shr_n_s8<7>),
1191                       MAP(imm_v256_shl_n_16<1>),
1192                       MAP(imm_v256_shl_n_16<2>),
1193                       MAP(imm_v256_shl_n_16<4>),
1194                       MAP(imm_v256_shl_n_16<6>),
1195                       MAP(imm_v256_shl_n_16<8>),
1196                       MAP(imm_v256_shl_n_16<10>),
1197                       MAP(imm_v256_shl_n_16<12>),
1198                       MAP(imm_v256_shl_n_16<14>),
1199                       MAP(imm_v256_shr_n_u16<1>),
1200                       MAP(imm_v256_shr_n_u16<2>),
1201                       MAP(imm_v256_shr_n_u16<4>),
1202                       MAP(imm_v256_shr_n_u16<6>),
1203                       MAP(imm_v256_shr_n_u16<8>),
1204                       MAP(imm_v256_shr_n_u16<10>),
1205                       MAP(imm_v256_shr_n_u16<12>),
1206                       MAP(imm_v256_shr_n_u16<14>),
1207                       MAP(imm_v256_shr_n_s16<1>),
1208                       MAP(imm_v256_shr_n_s16<2>),
1209                       MAP(imm_v256_shr_n_s16<4>),
1210                       MAP(imm_v256_shr_n_s16<6>),
1211                       MAP(imm_v256_shr_n_s16<8>),
1212                       MAP(imm_v256_shr_n_s16<10>),
1213                       MAP(imm_v256_shr_n_s16<12>),
1214                       MAP(imm_v256_shr_n_s16<14>),
1215                       MAP(imm_v256_shl_n_32<1>),
1216                       MAP(imm_v256_shl_n_32<4>),
1217                       MAP(imm_v256_shl_n_32<8>),
1218                       MAP(imm_v256_shl_n_32<12>),
1219                       MAP(imm_v256_shl_n_32<16>),
1220                       MAP(imm_v256_shl_n_32<20>),
1221                       MAP(imm_v256_shl_n_32<24>),
1222                       MAP(imm_v256_shl_n_32<28>),
1223                       MAP(imm_v256_shr_n_u32<1>),
1224                       MAP(imm_v256_shr_n_u32<4>),
1225                       MAP(imm_v256_shr_n_u32<8>),
1226                       MAP(imm_v256_shr_n_u32<12>),
1227                       MAP(imm_v256_shr_n_u32<16>),
1228                       MAP(imm_v256_shr_n_u32<20>),
1229                       MAP(imm_v256_shr_n_u32<24>),
1230                       MAP(imm_v256_shr_n_u32<28>),
1231                       MAP(imm_v256_shr_n_s32<1>),
1232                       MAP(imm_v256_shr_n_s32<4>),
1233                       MAP(imm_v256_shr_n_s32<8>),
1234                       MAP(imm_v256_shr_n_s32<12>),
1235                       MAP(imm_v256_shr_n_s32<16>),
1236                       MAP(imm_v256_shr_n_s32<20>),
1237                       MAP(imm_v256_shr_n_s32<24>),
1238                       MAP(imm_v256_shr_n_s32<28>),
1239                       MAP(imm_v256_shl_n_64<1>),
1240                       MAP(imm_v256_shl_n_64<4>),
1241                       MAP(imm_v256_shl_n_64<8>),
1242                       MAP(imm_v256_shl_n_64<12>),
1243                       MAP(imm_v256_shl_n_64<16>),
1244                       MAP(imm_v256_shl_n_64<20>),
1245                       MAP(imm_v256_shl_n_64<24>),
1246                       MAP(imm_v256_shl_n_64<28>),
1247                       MAP(imm_v256_shl_n_64<32>),
1248                       MAP(imm_v256_shl_n_64<36>),
1249                       MAP(imm_v256_shl_n_64<40>),
1250                       MAP(imm_v256_shl_n_64<44>),
1251                       MAP(imm_v256_shl_n_64<48>),
1252                       MAP(imm_v256_shl_n_64<52>),
1253                       MAP(imm_v256_shl_n_64<56>),
1254                       MAP(imm_v256_shl_n_64<60>),
1255                       MAP(imm_v256_shr_n_u64<1>),
1256                       MAP(imm_v256_shr_n_u64<4>),
1257                       MAP(imm_v256_shr_n_u64<8>),
1258                       MAP(imm_v256_shr_n_u64<12>),
1259                       MAP(imm_v256_shr_n_u64<16>),
1260                       MAP(imm_v256_shr_n_u64<20>),
1261                       MAP(imm_v256_shr_n_u64<24>),
1262                       MAP(imm_v256_shr_n_u64<28>),
1263                       MAP(imm_v256_shr_n_u64<32>),
1264                       MAP(imm_v256_shr_n_u64<36>),
1265                       MAP(imm_v256_shr_n_u64<40>),
1266                       MAP(imm_v256_shr_n_u64<44>),
1267                       MAP(imm_v256_shr_n_u64<48>),
1268                       MAP(imm_v256_shr_n_u64<52>),
1269                       MAP(imm_v256_shr_n_u64<56>),
1270                       MAP(imm_v256_shr_n_u64<60>),
1271                       MAP(imm_v256_shr_n_s64<1>),
1272                       MAP(imm_v256_shr_n_s64<4>),
1273                       MAP(imm_v256_shr_n_s64<8>),
1274                       MAP(imm_v256_shr_n_s64<12>),
1275                       MAP(imm_v256_shr_n_s64<16>),
1276                       MAP(imm_v256_shr_n_s64<20>),
1277                       MAP(imm_v256_shr_n_s64<24>),
1278                       MAP(imm_v256_shr_n_s64<28>),
1279                       MAP(imm_v256_shr_n_s64<32>),
1280                       MAP(imm_v256_shr_n_s64<36>),
1281                       MAP(imm_v256_shr_n_s64<40>),
1282                       MAP(imm_v256_shr_n_s64<44>),
1283                       MAP(imm_v256_shr_n_s64<48>),
1284                       MAP(imm_v256_shr_n_s64<52>),
1285                       MAP(imm_v256_shr_n_s64<56>),
1286                       MAP(imm_v256_shr_n_s64<60>),
1287                       MAP(v256_movemask_8),
1288                       MAP(v256_zero),
1289                       MAP(v256_dup_8),
1290                       MAP(v256_dup_16),
1291                       MAP(v256_dup_32),
1292                       MAP(v256_dup_64),
1293                       MAP(v256_low_u32),
1294                       MAP(v256_low_v64),
1295                       MAP(v256_from_64),
1296                       MAP(v256_from_v64),
1297                       MAP(v256_ziplo_128),
1298                       MAP(v256_ziphi_128),
1299                       MAP(v256_unpacklo_u8_s16),
1300                       MAP(v256_unpackhi_u8_s16),
1301                       MAP(v256_unpacklo_s8_s16),
1302                       MAP(v256_unpackhi_s8_s16),
1303                       MAP(v256_blend_8),
1304                       { nullptr, nullptr, nullptr } };
1305 #undef MAP
1306 
1307 // Map reference functions to machine tuned functions. Since the
1308 // functions depend on machine tuned types, the non-machine tuned
1309 // instantiations of the test can't refer to these functions directly,
1310 // so we refer to them by name and do the mapping here.
Map(const char * name,fptr * ref,fptr * simd)1311 void Map(const char *name, fptr *ref, fptr *simd) {
1312   unsigned int i;
1313   for (i = 0; m[i].name && strcmp(name, m[i].name); i++) {
1314   }
1315 
1316   *ref = m[i].ref;
1317   *simd = m[i].simd;
1318 }
1319 
1320 // Used for printing errors in TestSimd1Arg, TestSimd2Args and TestSimd3Args
Print(const uint8_t * a,int size)1321 std::string Print(const uint8_t *a, int size) {
1322   std::string text = "0x";
1323   for (int i = 0; i < size; i++) {
1324     const uint8_t c = a[!CONFIG_BIG_ENDIAN ? size - 1 - i : i];
1325     // Same as snprintf(..., ..., "%02x", c)
1326     text += (c >> 4) + '0' + ((c >> 4) > 9) * ('a' - '0' - 10);
1327     text += (c & 15) + '0' + ((c & 15) > 9) * ('a' - '0' - 10);
1328   }
1329 
1330   return text;
1331 }
1332 
1333 // Used in TestSimd1Arg, TestSimd2Args and TestSimd3Args to restrict argument
1334 // ranges
SetMask(uint8_t * s,int size,uint32_t mask,uint32_t maskwidth)1335 void SetMask(uint8_t *s, int size, uint32_t mask, uint32_t maskwidth) {
1336   switch (maskwidth) {
1337     case 0: {
1338       break;
1339     }
1340     case 8: {
1341       for (int i = 0; i < size; i++) s[i] &= mask;
1342       break;
1343     }
1344     case 16: {
1345       uint16_t *t = reinterpret_cast<uint16_t *>(s);
1346       assert(!(reinterpret_cast<uintptr_t>(s) & 1));
1347       for (int i = 0; i < size / 2; i++) t[i] &= mask;
1348       break;
1349     }
1350     case 32: {
1351       uint32_t *t = reinterpret_cast<uint32_t *>(s);
1352       assert(!(reinterpret_cast<uintptr_t>(s) & 3));
1353       for (int i = 0; i < size / 4; i++) t[i] &= mask;
1354       break;
1355     }
1356     case 64: {
1357       uint64_t *t = reinterpret_cast<uint64_t *>(s);
1358       assert(!(reinterpret_cast<uintptr_t>(s) & 7));
1359       for (int i = 0; i < size / 8; i++) t[i] &= mask;
1360       break;
1361     }
1362     default: {
1363       FAIL() << "Unsupported mask width";
1364       break;
1365     }
1366   }
1367 }
1368 
1369 // We need some extra load/store functions
// Store the 64-bit value |a| at |p| via the v64 SIMD type; |p| must meet
// v64_store_aligned's alignment requirement.
void u64_store_aligned(void *p, uint64_t a) {
  v64_store_aligned(p, v64_from_64(a));
}
// Store the signed 32-bit value |a| at aligned |p|, reusing the unsigned
// store (bit pattern is preserved by the cast).
void s32_store_aligned(void *p, int32_t a) {
  u32_store_aligned(p, static_cast<uint32_t>(a));
}
// Store the signed 64-bit value |a| at aligned |p| via the v64 SIMD type
// (bit pattern is preserved by the cast).
void s64_store_aligned(void *p, int64_t a) {
  v64_store_aligned(p, v64_from_64(static_cast<uint64_t>(a)));
}
1379 
// C-reference counterpart of u64_store_aligned, built on the plain-C
// c_v64 implementation.
void c_u64_store_aligned(void *p, uint64_t a) {
  c_v64_store_aligned(p, c_v64_from_64(a));
}
1383 
// C-reference counterpart of s32_store_aligned (bit pattern preserved by
// the cast).
void c_s32_store_aligned(void *p, int32_t a) {
  c_u32_store_aligned(p, static_cast<uint32_t>(a));
}
1387 
// C-reference counterpart of s64_store_aligned (bit pattern preserved by
// the cast).
void c_s64_store_aligned(void *p, int64_t a) {
  c_v64_store_aligned(p, c_v64_from_64(static_cast<uint64_t>(a)));
}
1391 
// Load a 64-bit value from aligned |p| via the v64 SIMD type.
uint64_t u64_load_aligned(const void *p) {
  return v64_u64(v64_load_aligned(p));
}
// Load a 16-bit value from |p|; |p| must be 2-byte aligned.
uint16_t u16_load_aligned(const void *p) {
  return *static_cast<const uint16_t *>(p);
}
// Load a single byte from |p|.
uint8_t u8_load_aligned(const void *p) {
  return *static_cast<const uint8_t *>(p);
}
1401 
// C-reference counterpart of u64_load_aligned, built on the plain-C
// c_v64 implementation.
uint64_t c_u64_load_aligned(const void *p) {
  return c_v64_u64(c_v64_load_aligned(p));
}
// C-reference counterpart of u16_load_aligned: plain aligned 16-bit load.
uint16_t c_u16_load_aligned(const void *p) {
  const uint16_t *const src = static_cast<const uint16_t *>(p);
  return *src;
}
// C-reference counterpart of u8_load_aligned: plain byte load.
uint8_t c_u8_load_aligned(const void *p) {
  const uint8_t *const src = static_cast<const uint8_t *>(p);
  return *src;
}
1411 
1412 // CompareSimd1Arg, CompareSimd2Args and CompareSimd3Args compare
1413 // intrinsics taking 1, 2 or 3 arguments respectively with their
1414 // corresponding C reference.  Ideally, the loads and stores should
1415 // have gone into the template parameter list, but v64 and v128 could
1416 // be typedef'ed to the same type (which is the case on x86) and then
1417 // we can't instantiate both v64 and v128, so the function return and
1418 // argument types, including the always differing types in the C
1419 // equivalent are used instead.  The function arguments must be void
1420 // pointers and then go through a cast to avoid matching errors in the
1421 // branches eliminated by the typeid tests in the calling function.
1422 template <typename Ret, typename Arg, typename CRet, typename CArg>
CompareSimd1Arg(fptr store,fptr load,fptr simd,void * d,fptr c_store,fptr c_load,fptr c_simd,void * ref_d,const void * a)1423 int CompareSimd1Arg(fptr store, fptr load, fptr simd, void *d, fptr c_store,
1424                     fptr c_load, fptr c_simd, void *ref_d, const void *a) {
1425   void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
1426   Arg (*const my_load)(const void *) = (Arg(*const)(const void *))load;
1427   Ret (*const my_simd)(Arg) = (Ret(*const)(Arg))simd;
1428   void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
1429   CArg (*const my_c_load)(const void *) = (CArg(*const)(const void *))c_load;
1430   CRet (*const my_c_simd)(CArg) = (CRet(*const)(CArg))c_simd;
1431 
1432   // Call reference and intrinsic
1433   my_c_store(ref_d, my_c_simd(my_c_load(a)));
1434   my_store(d, my_simd(my_load(a)));
1435 
1436   // Compare results
1437   return memcmp(ref_d, d, sizeof(CRet));
1438 }
1439 
1440 template <typename Ret, typename Arg1, typename Arg2, typename CRet,
1441           typename CArg1, typename CArg2>
CompareSimd2Args(fptr store,fptr load1,fptr load2,fptr simd,void * d,fptr c_store,fptr c_load1,fptr c_load2,fptr c_simd,void * ref_d,const void * a,const void * b)1442 int CompareSimd2Args(fptr store, fptr load1, fptr load2, fptr simd, void *d,
1443                      fptr c_store, fptr c_load1, fptr c_load2, fptr c_simd,
1444                      void *ref_d, const void *a, const void *b) {
1445   void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
1446   Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
1447   Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
1448   Ret (*const my_simd)(Arg1, Arg2) = (Ret(*const)(Arg1, Arg2))simd;
1449   void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
1450   CArg1 (*const my_c_load1)(const void *) =
1451       (CArg1(*const)(const void *))c_load1;
1452   CArg2 (*const my_c_load2)(const void *) =
1453       (CArg2(*const)(const void *))c_load2;
1454   CRet (*const my_c_simd)(CArg1, CArg2) = (CRet(*const)(CArg1, CArg2))c_simd;
1455 
1456   // Call reference and intrinsic
1457   my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b)));
1458   my_store(d, my_simd(my_load1(a), my_load2(b)));
1459 
1460   // Compare results
1461   return memcmp(ref_d, d, sizeof(CRet));
1462 }
1463 
1464 template <typename Ret, typename Arg1, typename Arg2, typename Arg3,
1465           typename CRet, typename CArg1, typename CArg2, typename CArg3>
CompareSimd3Args(fptr store,fptr load1,fptr load2,fptr load3,fptr simd,void * d,fptr c_store,fptr c_load1,fptr c_load2,fptr c_load3,fptr c_simd,void * ref_d,const void * a,const void * b,const void * c)1466 int CompareSimd3Args(fptr store, fptr load1, fptr load2, fptr load3, fptr simd,
1467                      void *d, fptr c_store, fptr c_load1, fptr c_load2,
1468                      fptr c_load3, fptr c_simd, void *ref_d, const void *a,
1469                      const void *b, const void *c) {
1470   void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
1471   Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
1472   Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
1473   Arg3 (*const my_load3)(const void *) = (Arg3(*const)(const void *))load3;
1474   Ret (*const my_simd)(Arg1, Arg2, Arg3) = (Ret(*const)(Arg1, Arg2, Arg3))simd;
1475   void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
1476   CArg1 (*const my_c_load1)(const void *) =
1477       (CArg1(*const)(const void *))c_load1;
1478   CArg2 (*const my_c_load2)(const void *) =
1479       (CArg2(*const)(const void *))c_load2;
1480   CArg3 (*const my_c_load3)(const void *) =
1481       (CArg3(*const)(const void *))c_load3;
1482   CRet (*const my_c_simd)(CArg1, CArg2, CArg3) =
1483       (CRet(*const)(CArg1, CArg2, CArg3))c_simd;
1484 
1485   // Call reference and intrinsic
1486   my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b), my_c_load3(c)));
1487   my_store(d, my_simd(my_load1(a), my_load2(b), my_load3(c)));
1488 
1489   // Compare results
1490   return memcmp(ref_d, d, sizeof(CRet));
1491 }
1492 
1493 }  // namespace
1494 
// Tests the 1-argument intrinsic named |name| against its C reference
// implementation over |iterations| random inputs, reporting mismatches via
// gtest.  CRet/CArg are the C reference return/argument types; the typeid
// chain below selects the matching CompareSimd1Arg instantiation.  If
// |maskwidth| is non-zero, SetMask() is applied to the random input bytes
// (mask/maskwidth constrain the argument -- NOTE(review): exact SetMask
// semantics are defined elsewhere in this file; confirm there).
template <typename CRet, typename CArg>
void TestSimd1Arg(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
                  const char *name) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  fptr ref_simd;
  fptr simd;
  int error = 0;
  // 32-byte aligned buffers: s is the random source, d receives the
  // intrinsic's result and ref_d the C reference's result.  32 bytes is
  // wide enough for every supported type, enforced by the assert below.
  DECLARE_ALIGNED(32, uint8_t, s[32]);
  DECLARE_ALIGNED(32, uint8_t, d[32]);
  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
  assert(sizeof(CArg) <= 32 && sizeof(CRet) <= 32);
  memset(ref_d, 0, sizeof(ref_d));
  memset(d, 0, sizeof(d));

  // Resolve |name| to the machine-specific and C reference pointers.
  Map(name, &ref_simd, &simd);
  if (simd == nullptr || ref_simd == nullptr) {
    FAIL() << "Internal error: Unknown intrinsic function " << name;
  }
  // Stop at the first mismatch or any previously recorded gtest failure.
  for (unsigned int count = 0;
       count < iterations && !error && !testing::Test::HasFailure(); count++) {
    for (unsigned int c = 0; c < sizeof(CArg); c++) s[c] = rnd.Rand8();

    if (maskwidth) {
      SetMask(s, sizeof(CArg), mask, maskwidth);
    }

    // Dispatch on the C reference signature.  Each branch comment names
    // the return/argument shape, e.g. V64_U8 is "v64 fn(uint8_t)".
    if (typeid(CRet) == typeid(c_v64) && typeid(CArg) == typeid(c_v64)) {
      // V64_V64
      error = CompareSimd1Arg<v64, v64, c_v64, c_v64>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(uint8_t)) {
      // V64_U8
      error = CompareSimd1Arg<v64, uint8_t, c_v64, uint8_t>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(uint16_t)) {
      // V64_U16
      error = CompareSimd1Arg<v64, uint16_t, c_v64, uint16_t>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(uint32_t)) {
      // V64_U32
      error = CompareSimd1Arg<v64, uint32_t, c_v64, uint32_t>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // U64_V64
      error = CompareSimd1Arg<uint64_t, v64, uint64_t, c_v64>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(int64_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // S64_V64
      error = CompareSimd1Arg<int64_t, v64, int64_t, c_v64>(
          reinterpret_cast<fptr>(s64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // U32_V64
      error = CompareSimd1Arg<uint32_t, v64, uint32_t, c_v64>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(int32_t) &&
               typeid(CArg) == typeid(c_v64)) {
      // S32_V64
      error = CompareSimd1Arg<int32_t, v64, int32_t, c_v64>(
          reinterpret_cast<fptr>(s32_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s32_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg) == typeid(c_v128)) {
      // U32_V128
      error = CompareSimd1Arg<uint32_t, v128, uint32_t, c_v128>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg) == typeid(c_v128)) {
      // U64_V128
      error = CompareSimd1Arg<uint64_t, v128, uint64_t, c_v128>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg) == typeid(c_v256)) {
      // U64_V256
      error = CompareSimd1Arg<uint64_t, v256, uint64_t, c_v256>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(c_v128)) {
      // V64_V128
      error = CompareSimd1Arg<v64, v128, c_v64, c_v128>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(c_v128)) {
      // V128_V128
      error = CompareSimd1Arg<v128, v128, c_v128, c_v128>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(c_v64)) {
      // V128_V64
      error = CompareSimd1Arg<v128, v64, c_v128, c_v64>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint8_t)) {
      // V128_U8
      error = CompareSimd1Arg<v128, uint8_t, c_v128, uint8_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint16_t)) {
      // V128_U16
      error = CompareSimd1Arg<v128, uint16_t, c_v128, uint16_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint32_t)) {
      // V128_U32
      error = CompareSimd1Arg<v128, uint32_t, c_v128, uint32_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg) == typeid(uint64_t)) {
      // V128_U64
      error = CompareSimd1Arg<v128, uint64_t, c_v128, uint64_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(c_v256)) {
      // V256_V256
      error = CompareSimd1Arg<v256, v256, c_v256, c_v256>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(c_v128)) {
      // V256_V128
      error = CompareSimd1Arg<v256, v128, c_v256, c_v128>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint8_t)) {
      // V256_U8
      error = CompareSimd1Arg<v256, uint8_t, c_v256, uint8_t>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u8_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u8_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint16_t)) {
      // V256_U16
      error = CompareSimd1Arg<v256, uint16_t, c_v256, uint16_t>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u16_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u16_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint32_t)) {
      // V256_U32
      error = CompareSimd1Arg<v256, uint32_t, c_v256, uint32_t>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg) == typeid(uint64_t)) {
      // V256_U64
      error = CompareSimd1Arg<v256, uint64_t, c_v256, uint64_t>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg) == typeid(c_v256)) {
      // U32_V256
      error = CompareSimd1Arg<uint32_t, v256, uint32_t, c_v256>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg) == typeid(c_v256)) {
      // V64_V256
      error = CompareSimd1Arg<v64, v256, c_v64, c_v256>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned), ref_simd, ref_d, s);
    } else {
      FAIL() << "Internal error: Unknown intrinsic function "
             << typeid(CRet).name() << " " << name << "(" << typeid(CArg).name()
             << ")";
    }
  }

  // On mismatch, print the input, the SIMD output and the reference output.
  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
                      << Print(s, sizeof(CArg)) << ") -> "
                      << Print(d, sizeof(CRet)) << " (simd), "
                      << Print(ref_d, sizeof(CRet)) << " (ref)";
}
1740 
// Tests the 2-argument intrinsic named |name| against its C reference
// implementation over |iterations| random input pairs, reporting
// mismatches via gtest.  CRet/CArg1/CArg2 are the C reference
// return/argument types; the typeid chain below selects the matching
// CompareSimd2Args instantiation.  If |maskwidth| is non-zero, SetMask()
// is applied to the second argument's random bytes only (commonly the
// shift/count operand -- NOTE(review): SetMask is defined elsewhere in
// this file; confirm its semantics there).
template <typename CRet, typename CArg1, typename CArg2>
void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
                   const char *name) {
  ACMRandom rnd(ACMRandom::DeterministicSeed());
  fptr ref_simd;
  fptr simd;
  int error = 0;
  // 32-byte aligned buffers: s1/s2 are the random sources, d receives the
  // intrinsic's result and ref_d the C reference's result.  32 bytes is
  // wide enough for every supported type, enforced by the assert below.
  DECLARE_ALIGNED(32, uint8_t, s1[32]);
  DECLARE_ALIGNED(32, uint8_t, s2[32]);
  DECLARE_ALIGNED(32, uint8_t, d[32]);
  DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
  assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CRet) <= 32);
  memset(ref_d, 0, sizeof(ref_d));
  memset(d, 0, sizeof(d));

  // Resolve |name| to the machine-specific and C reference pointers.
  Map(name, &ref_simd, &simd);
  if (simd == nullptr || ref_simd == nullptr) {
    FAIL() << "Internal error: Unknown intrinsic function " << name;
  }

  // Stop at the first mismatch or any previously recorded gtest failure.
  for (unsigned int count = 0;
       count < iterations && !error && !testing::Test::HasFailure(); count++) {
    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();

    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();

    if (maskwidth) SetMask(s2, sizeof(CArg2), mask, maskwidth);

    // Dispatch on the C reference signature.  Each branch comment names
    // the return/argument shape, e.g. V64_V64U32 is
    // "v64 fn(v64, uint32_t)".
    if (typeid(CRet) == typeid(c_v64) && typeid(CArg1) == typeid(c_v64) &&
        typeid(CArg2) == typeid(c_v64)) {
      // V64_V64V64
      error = CompareSimd2Args<v64, v64, v64, c_v64, c_v64, c_v64>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg1) == typeid(uint32_t) &&
               typeid(CArg2) == typeid(uint32_t)) {
      // V64_U32U32
      error =
          CompareSimd2Args<v64, uint32_t, uint32_t, c_v64, uint32_t, uint32_t>(
              reinterpret_cast<fptr>(v64_store_aligned),
              reinterpret_cast<fptr>(u32_load_aligned),
              reinterpret_cast<fptr>(u32_load_aligned), simd, d,
              reinterpret_cast<fptr>(c_v64_store_aligned),
              reinterpret_cast<fptr>(c_u32_load_aligned),
              reinterpret_cast<fptr>(c_u32_load_aligned),
              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg1) == typeid(c_v64) &&
               typeid(CArg2) == typeid(c_v64)) {
      // U32_V64V64
      error = CompareSimd2Args<uint32_t, v64, v64, uint32_t, c_v64, c_v64>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(int64_t) &&
               typeid(CArg1) == typeid(c_v64) &&
               typeid(CArg2) == typeid(c_v64)) {
      // S64_V64V64
      error = CompareSimd2Args<int64_t, v64, v64, int64_t, c_v64, c_v64>(
          reinterpret_cast<fptr>(s64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v64) &&
               typeid(CArg1) == typeid(c_v64) &&
               typeid(CArg2) == typeid(uint32_t)) {
      // V64_V64U32
      error = CompareSimd2Args<v64, v64, uint32_t, c_v64, c_v64, uint32_t>(
          reinterpret_cast<fptr>(v64_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v64_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(c_v128)) {
      // V128_V128V128
      error = CompareSimd2Args<v128, v128, v128, c_v128, c_v128, c_v128>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(c_v128)) {
      // U32_V128V128
      error = CompareSimd2Args<uint32_t, v128, v128, uint32_t, c_v128, c_v128>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(c_v128)) {
      // U64_V128V128
      error = CompareSimd2Args<uint64_t, v128, v128, uint64_t, c_v128, c_v128>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(int64_t) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(c_v128)) {
      // S64_V128V128
      error = CompareSimd2Args<int64_t, v128, v128, int64_t, c_v128, c_v128>(
          reinterpret_cast<fptr>(s64_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s64_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg1) == typeid(uint64_t) &&
               typeid(CArg2) == typeid(uint64_t)) {
      // V128_U64U64
      error = CompareSimd2Args<v128, uint64_t, uint64_t, c_v128, uint64_t,
                               uint64_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(u64_load_aligned),
          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_u64_load_aligned),
          reinterpret_cast<fptr>(c_u64_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg1) == typeid(c_v64) &&
               typeid(CArg2) == typeid(c_v64)) {
      // V128_V64V64
      error = CompareSimd2Args<v128, v64, v64, c_v128, c_v64, c_v64>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v64_load_aligned),
          reinterpret_cast<fptr>(v64_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(c_v64_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v128) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(uint32_t)) {
      // V128_V128U32
      error = CompareSimd2Args<v128, v128, uint32_t, c_v128, c_v128, uint32_t>(
          reinterpret_cast<fptr>(v128_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v128_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(c_v256)) {
      // V256_V256V256
      error = CompareSimd2Args<v256, v256, v256, c_v256, c_v256, c_v256>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(uint64_t) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(c_v256)) {
      // U64_V256V256
      error = CompareSimd2Args<uint64_t, v256, v256, uint64_t, c_v256, c_v256>(
          reinterpret_cast<fptr>(u64_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u64_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(int64_t) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(c_v256)) {
      // S64_V256V256
      error = CompareSimd2Args<int64_t, v256, v256, int64_t, c_v256, c_v256>(
          reinterpret_cast<fptr>(s64_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_s64_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(uint32_t) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(c_v256)) {
      // U32_V256V256
      error = CompareSimd2Args<uint32_t, v256, v256, uint32_t, c_v256, c_v256>(
          reinterpret_cast<fptr>(u32_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_u32_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg1) == typeid(c_v128) &&
               typeid(CArg2) == typeid(c_v128)) {
      // V256_V128V128
      error = CompareSimd2Args<v256, v128, v128, c_v256, c_v128, c_v128>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v128_load_aligned),
          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(c_v128_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
    } else if (typeid(CRet) == typeid(c_v256) &&
               typeid(CArg1) == typeid(c_v256) &&
               typeid(CArg2) == typeid(uint32_t)) {
      // V256_V256U32
      error = CompareSimd2Args<v256, v256, uint32_t, c_v256, c_v256, uint32_t>(
          reinterpret_cast<fptr>(v256_store_aligned),
          reinterpret_cast<fptr>(v256_load_aligned),
          reinterpret_cast<fptr>(u32_load_aligned), simd, d,
          reinterpret_cast<fptr>(c_v256_store_aligned),
          reinterpret_cast<fptr>(c_v256_load_aligned),
          reinterpret_cast<fptr>(c_u32_load_aligned),
          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);

    } else {
      FAIL() << "Internal error: Unknown intrinsic function "
             << typeid(CRet).name() << " " << name << "("
             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ")";
    }
  }

  // On mismatch, print the inputs, the SIMD output and the reference output.
  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
                      << Print(s1, sizeof(CArg1)) << ", "
                      << Print(s2, sizeof(CArg2)) << ") -> "
                      << Print(d, sizeof(CRet)) << " (simd), "
                      << Print(ref_d, sizeof(CRet)) << " (ref)";
}
2000 
2001 template <typename CRet, typename CArg1, typename CArg2, typename CArg3>
TestSimd3Args(uint32_t iterations,uint32_t mask,uint32_t maskwidth,const char * name)2002 void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
2003                    const char *name) {
2004   ACMRandom rnd(ACMRandom::DeterministicSeed());
2005   fptr ref_simd;
2006   fptr simd;
2007   int error = 0;
2008   DECLARE_ALIGNED(32, uint8_t, s1[32]);
2009   DECLARE_ALIGNED(32, uint8_t, s2[32]);
2010   DECLARE_ALIGNED(32, uint8_t, s3[32]);
2011   DECLARE_ALIGNED(32, uint8_t, d[32]);
2012   DECLARE_ALIGNED(32, uint8_t, ref_d[32]);
2013   assert(sizeof(CArg1) <= 32 && sizeof(CArg2) <= 32 && sizeof(CArg3) <= 32 &&
2014          sizeof(CRet) <= 32);
2015   memset(ref_d, 0, sizeof(ref_d));
2016   memset(d, 0, sizeof(d));
2017 
2018   Map(name, &ref_simd, &simd);
2019   if (simd == nullptr || ref_simd == nullptr) {
2020     FAIL() << "Internal error: Unknown intrinsic function " << name;
2021   }
2022 
2023   for (unsigned int count = 0;
2024        count < iterations && !error && !testing::Test::HasFailure(); count++) {
2025     for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();
2026 
2027     for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();
2028 
2029     for (unsigned int c = 0; c < sizeof(CArg3); c++) s3[c] = rnd.Rand8();
2030 
2031     if (maskwidth) SetMask(s3, sizeof(CArg3), mask, maskwidth);
2032 
2033     if (typeid(CRet) == typeid(c_v128) && typeid(CArg1) == typeid(c_v128) &&
2034         typeid(CArg2) == typeid(c_v128) && typeid(CArg3) == typeid(c_v128)) {
2035       // V128_V128V128V128
2036       error = CompareSimd3Args<v128, v128, v128, v128, c_v128, c_v128, c_v128,
2037                                c_v128>(
2038           reinterpret_cast<fptr>(v128_store_aligned),
2039           reinterpret_cast<fptr>(v128_load_aligned),
2040           reinterpret_cast<fptr>(v128_load_aligned),
2041           reinterpret_cast<fptr>(v128_load_aligned), simd, d,
2042           reinterpret_cast<fptr>(c_v128_store_aligned),
2043           reinterpret_cast<fptr>(c_v128_load_aligned),
2044           reinterpret_cast<fptr>(c_v128_load_aligned),
2045           reinterpret_cast<fptr>(c_v128_load_aligned),
2046           reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
2047     } else if (typeid(CRet) == typeid(c_v256) &&
2048                typeid(CArg1) == typeid(c_v256) &&
2049                typeid(CArg2) == typeid(c_v256) &&
2050                typeid(CArg3) == typeid(c_v256)) {
2051       // V256_V256V256V256
2052       error = CompareSimd3Args<v256, v256, v256, v256, c_v256, c_v256, c_v256,
2053                                c_v256>(
2054           reinterpret_cast<fptr>(v256_store_aligned),
2055           reinterpret_cast<fptr>(v256_load_aligned),
2056           reinterpret_cast<fptr>(v256_load_aligned),
2057           reinterpret_cast<fptr>(v256_load_aligned), simd, d,
2058           reinterpret_cast<fptr>(c_v256_store_aligned),
2059           reinterpret_cast<fptr>(c_v256_load_aligned),
2060           reinterpret_cast<fptr>(c_v256_load_aligned),
2061           reinterpret_cast<fptr>(c_v256_load_aligned),
2062           reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
2063     } else {
2064       FAIL() << "Internal error: Unknown intrinsic function "
2065              << typeid(CRet).name() << " " << name << "("
2066              << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ", "
2067              << typeid(CArg3).name() << ")";
2068     }
2069   }
2070 
2071   EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
2072                       << Print(s1, sizeof(CArg1)) << ", "
2073                       << Print(s2, sizeof(CArg2)) << ", "
2074                       << Print(s3, sizeof(CArg3)) << ") -> "
2075                       << Print(d, sizeof(CRet)) << " (simd), "
2076                       << Print(ref_d, sizeof(CRet)) << " (ref)";
2077 }
2078 
// Explicit instantiations to make the functions callable from other files
// (the arch-specific simd_cmp_*.cc translation units dispatch to these by
// C-reference signature).

// 64-bit (v64) variants.
template void TestSimd1Arg<c_v64, uint8_t>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<c_v64, uint16_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v64, uint32_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                         const char *);
template void TestSimd1Arg<uint32_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<int32_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<uint64_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<int64_t, c_v64>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd2Args<c_v64, uint32_t, uint32_t>(uint32_t, uint32_t,
                                                       uint32_t, const char *);
template void TestSimd2Args<c_v64, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                                 const char *);
template void TestSimd2Args<c_v64, c_v64, uint32_t>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<int64_t, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                                   const char *);
template void TestSimd2Args<uint32_t, c_v64, c_v64>(uint32_t, uint32_t,
                                                    uint32_t, const char *);

// 128-bit (v128) variants.
template void TestSimd1Arg<c_v128, c_v128>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<c_v128, uint8_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v128, uint16_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v128, uint32_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v128, uint64_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v128, c_v64>(uint32_t, uint32_t, uint32_t,
                                          const char *);
template void TestSimd1Arg<uint32_t, c_v128>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<uint64_t, c_v128>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v64, c_v128>(uint32_t, uint32_t, uint32_t,
                                          const char *);
template void TestSimd2Args<c_v128, c_v128, c_v128>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<c_v128, c_v128, uint32_t>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<c_v128, uint64_t, uint64_t>(uint32_t, uint32_t,
                                                        uint32_t, const char *);
template void TestSimd2Args<c_v128, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                                  const char *);
template void TestSimd2Args<uint64_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<int64_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                     uint32_t, const char *);
template void TestSimd2Args<uint32_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(uint32_t, uint32_t,
                                                            uint32_t,
                                                            const char *);

// 256-bit (v256) variants.
template void TestSimd1Arg<c_v256, c_v128>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<c_v256, c_v256>(uint32_t, uint32_t, uint32_t,
                                           const char *);
template void TestSimd1Arg<uint64_t, c_v256>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v256, uint8_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
template void TestSimd1Arg<c_v256, uint16_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v256, uint32_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v256, uint64_t>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<uint32_t, c_v256>(uint32_t, uint32_t, uint32_t,
                                             const char *);
template void TestSimd1Arg<c_v64, c_v256>(uint32_t, uint32_t, uint32_t,
                                          const char *);
template void TestSimd2Args<c_v256, c_v128, c_v128>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<c_v256, c_v256, c_v256>(uint32_t, uint32_t,
                                                    uint32_t, const char *);
template void TestSimd2Args<c_v256, c_v256, uint32_t>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<uint64_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd2Args<int64_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                     uint32_t, const char *);
template void TestSimd2Args<uint32_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
template void TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(uint32_t, uint32_t,
                                                            uint32_t,
                                                            const char *);
2174 
2175 }  // namespace SIMD_NAMESPACE
2176