/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_

#include <stdio.h>
#include <stdlib.h>

#include "config/aom_config.h"

#include "aom_dsp/simd/v64_intrinsics_c.h"

typedef union {
  uint8_t u8[16];
  uint16_t u16[8];
  uint32_t u32[4];
  uint64_t u64[2];
  int8_t s8[16];
  int16_t s16[8];
  int32_t s32[4];
  int64_t s64[2];
  c_v64 v64[2];
} c_v128;
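
/* The union above is just a set of lane views over the same 16 bytes of
 * vector data; how the wider lanes alias the narrower ones depends on host
 * endianness (hence the CONFIG_BIG_ENDIAN special cases further down).
 * A minimal, illustrative sketch using only accessors defined in this header:
 *
 *   c_v128 v = c_v128_from_32(3, 2, 1, 0);  // u32 lanes; v.u32[0] == 0
 *   uint32_t low = c_v128_low_u32(v);       // == 0
 *   c_v64 lo = c_v128_low_v64(v);           // low 64 bits (u32 lanes 0, 1)
 */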

SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; }

SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; }

SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; }

SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) {
  c_v128 t;
  t.u64[1] = hi;
  t.u64[0] = lo;
  return t;
}

SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) {
  c_v128 t;
  t.v64[1] = hi;
  t.v64[0] = lo;
  return t;
}

SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c,
                                  uint32_t d) {
  c_v128 t;
  t.u32[3] = a;
  t.u32[2] = b;
  t.u32[1] = c;
  t.u32[0] = d;
  return t;
}

SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
  c_v128 t;
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&t;
  int c;
  for (c = 0; c < 16; c++) q[c] = pp[c];
  return t;
}

SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) {
  if (SIMD_CHECK && (uintptr_t)p & 15) {
    fprintf(stderr, "Error: unaligned v128 load at %p\n", p);
    abort();
  }
  return c_v128_load_unaligned(p);
}

SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
  uint8_t *pp = (uint8_t *)p;
  uint8_t *q = (uint8_t *)&a;
  int c;
  for (c = 0; c < 16; c++) pp[c] = q[c];
}

SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
  if (SIMD_CHECK && (uintptr_t)p & 15) {
    fprintf(stderr, "Error: unaligned v128 store at %p\n", p);
    abort();
  }
  c_v128_store_unaligned(p, a);
}
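
/* Alignment sketch (illustrative only; buf is a hypothetical buffer): the
 * _aligned variants require a 16-byte aligned address and, with SIMD_CHECK
 * enabled, abort on violations, while the _unaligned variants accept any
 * address.
 *
 *   uint8_t buf[32];  // assume buf is 16-byte aligned, e.g. via an
 *                     // alignment attribute or aligned allocator
 *   c_v128 x = c_v128_load_aligned(buf);        // alignment is checked
 *   c_v128 y = c_v128_load_unaligned(buf + 1);  // any address is fine
 *   c_v128_store_aligned(buf, x);
 */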

SIMD_INLINE c_v128 c_v128_zero(void) {
  c_v128 t;
  t.u64[1] = t.u64[0] = 0;
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) {
  c_v128 t;
  t.v64[1] = t.v64[0] = c_v64_dup_8(x);
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) {
  c_v128 t;
  t.v64[1] = t.v64[0] = c_v64_dup_16(x);
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
  c_v128 t;
  t.v64[1] = t.v64[0] = c_v64_dup_32(x);
  return t;
}

SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) {
  c_v128 t;
  t.u64[1] = t.u64[0] = x;
  return t;
}

SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) {
  return c_v64_dotp_su8(a.v64[1], b.v64[1]) +
         c_v64_dotp_su8(a.v64[0], b.v64[0]);
}

SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
  return c_v64_dotp_s16(a.v64[1], b.v64[1]) +
         c_v64_dotp_s16(a.v64[0], b.v64[0]);
}

SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) {
  // 32 bit products, 64 bit sum
  return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) +
         (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) +
         (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) +
         (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]);
}

SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
  return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
}

typedef struct {
  uint32_t val;
  int count;
} c_sad128_internal;

SIMD_INLINE c_sad128_internal c_v128_sad_u8_init(void) {
  c_sad128_internal t;
  t.val = t.count = 0;
  return t;
}

/* Implementation dependent return value. Result must be finalised with
 * v128_sad_u8_sum(). The result for more than 32 v128_sad_u8() calls is
 * undefined. */
SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a,
                                            c_v128 b) {
  int c;
  for (c = 0; c < 16; c++)
    s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
  s.count++;
  if (SIMD_CHECK && s.count > 32) {
    fprintf(stderr,
            "Error: sad called more than 32 times, result is undefined\n");
    abort();
  }
  return s;
}

SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s.val; }
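
/* A minimal usage sketch of the SAD accumulator protocol (illustrative only;
 * the buffers, strides and loop bound are hypothetical):
 *
 *   c_sad128_internal acc = c_v128_sad_u8_init();
 *   for (int i = 0; i < rows; i++)  // rows <= 32, see the note above
 *     acc = c_v128_sad_u8(acc, c_v128_load_unaligned(src + i * src_stride),
 *                         c_v128_load_unaligned(ref + i * ref_stride));
 *   uint32_t sad = c_v128_sad_u8_sum(acc);
 */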

typedef uint32_t c_ssd128_internal;

SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init(void) { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v128_ssd_u8_sum(). */
SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a,
                                            c_v128 b) {
  int c;
  for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
  return s;
}

SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; }
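
/* The SSD accumulator follows the same init/accumulate/sum pattern as the
 * SAD above; a minimal sketch with hypothetical 16-byte blocks a and b:
 *
 *   c_ssd128_internal acc = c_v128_ssd_u8_init();
 *   acc = c_v128_ssd_u8(acc, c_v128_load_unaligned(a),
 *                       c_v128_load_unaligned(b));
 *   uint32_t ssd = c_v128_ssd_u8_sum(acc);
 */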

SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]),
                         c_v64_or(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]),
                         c_v64_xor(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]),
                         c_v64_and(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]),
                         c_v64_andn(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]),
                         c_v64_add_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]),
                         c_v64_add_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]),
                         c_v64_sadd_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]),
                         c_v64_sadd_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]),
                         c_v64_sadd_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]),
                         c_v64_add_32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) {
  // Two's complement overflow (silences sanitizers)
  return c_v128_from_64(
      a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1
                                   : a.v64[1].u64 + b.v64[1].u64,
      a.v64[0].u64 > ~b.v64[0].u64 ? a.v64[0].u64 - ~b.v64[0].u64 - 1
                                   : a.v64[0].u64 + b.v64[0].u64);
}
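
/* Note on the expression above (a worked identity, not additional API): for
 * unsigned 64-bit a and b, a > ~b is exactly the condition a + b >= 2^64,
 * and in that case a - ~b - 1 equals a + b - 2^64, i.e. the wrapped sum.
 * Writing it this way keeps the arithmetic in range for unsigned-overflow
 * sanitizers while producing the same two's complement result;
 * c_v128_sub_64() below uses the mirrored identity for underflow.
 */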

SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
  c_v128 t;
  t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
  t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
  t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
  t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
  return t;
}

SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) {
  c_v128 t;
  t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1];
  t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3];
  t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5];
  t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7];
  t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9];
  t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11];
  t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13];
  t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15];
  return t;
}

SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
                         c_v64_sub_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]),
                         c_v64_ssub_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]),
                         c_v64_ssub_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]),
                         c_v64_sub_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]),
                         c_v64_ssub_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]),
                         c_v64_ssub_u16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]),
                         c_v64_sub_32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) {
  // Two's complement underflow (silences sanitizers)
  return c_v128_from_64(
      a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1
                                  : a.v64[1].u64 - b.v64[1].u64,
      a.v64[0].u64 < b.v64[0].u64 ? a.v64[0].u64 + ~b.v64[0].u64 + 1
                                  : a.v64[0].u64 - b.v64[0].u64);
}

SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) {
  return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) {
  c_v64 lo_bits = c_v64_mullo_s16(a, b);
  c_v64 hi_bits = c_v64_mulhi_s16(a, b);
  return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits),
                         c_v64_ziplo_16(hi_bits, lo_bits));
}

SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]),
                         c_v64_mullo_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]),
                         c_v64_mulhi_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]),
                         c_v64_mullo_s32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]),
                         c_v64_madd_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]),
                         c_v64_madd_us8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]),
                         c_v64_avg_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]),
                         c_v64_rdavg_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]),
                         c_v64_rdavg_u16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]),
                         c_v64_avg_u16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]),
                         c_v64_min_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]),
                         c_v64_max_u8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]),
                         c_v64_min_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) {
  return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
         ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
         ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
         ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
         ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
         ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
         ((a.s8[0] < 0) << 0);
}

SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) {
  c_v128 t;
  for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
  return t;
}

SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]),
                         c_v64_max_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]),
                         c_v64_min_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]),
                         c_v64_max_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c];
  return t;
}

SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c];
  return t;
}

SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]),
                         c_v64_ziplo_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]),
                         c_v64_ziplo_8(a.v64[1], b.v64[1]));
}

SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]),
                         c_v64_ziplo_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]),
                         c_v64_ziplo_16(a.v64[1], b.v64[1]));
}

SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]),
                         c_v64_ziplo_32(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]),
                         c_v64_ziplo_32(a.v64[1], b.v64[1]));
}

SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) {
  return c_v128_from_v64(a.v64[0], b.v64[0]);
}

SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) {
  return c_v128_from_v64(a.v64[1], b.v64[1]);
}

SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) {
  return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b));
}

SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) {
  return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b));
}

SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) {
  return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b));
}

SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) {
  c_v128 t;
  if (mode) {
    t.u8[15] = b.u8[15];
    t.u8[14] = b.u8[13];
    t.u8[13] = b.u8[11];
    t.u8[12] = b.u8[9];
    t.u8[11] = b.u8[7];
    t.u8[10] = b.u8[5];
    t.u8[9] = b.u8[3];
    t.u8[8] = b.u8[1];
    t.u8[7] = a.u8[15];
    t.u8[6] = a.u8[13];
    t.u8[5] = a.u8[11];
    t.u8[4] = a.u8[9];
    t.u8[3] = a.u8[7];
    t.u8[2] = a.u8[5];
    t.u8[1] = a.u8[3];
    t.u8[0] = a.u8[1];
  } else {
    t.u8[15] = a.u8[14];
    t.u8[14] = a.u8[12];
    t.u8[13] = a.u8[10];
    t.u8[12] = a.u8[8];
    t.u8[11] = a.u8[6];
    t.u8[10] = a.u8[4];
    t.u8[9] = a.u8[2];
    t.u8[8] = a.u8[0];
    t.u8[7] = b.u8[14];
    t.u8[6] = b.u8[12];
    t.u8[5] = b.u8[10];
    t.u8[4] = b.u8[8];
    t.u8[3] = b.u8[6];
    t.u8[2] = b.u8[4];
    t.u8[1] = b.u8[2];
    t.u8[0] = b.u8[0];
  }
  return t;
}

SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1)
                           : _c_v128_unzip_8(a, b, 0);
}

SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(b, a, 0)
                           : _c_v128_unzip_8(b, a, 1);
}

SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) {
  c_v128 t;
  if (mode) {
    t.u16[7] = b.u16[7];
    t.u16[6] = b.u16[5];
    t.u16[5] = b.u16[3];
    t.u16[4] = b.u16[1];
    t.u16[3] = a.u16[7];
    t.u16[2] = a.u16[5];
    t.u16[1] = a.u16[3];
    t.u16[0] = a.u16[1];
  } else {
    t.u16[7] = a.u16[6];
    t.u16[6] = a.u16[4];
    t.u16[5] = a.u16[2];
    t.u16[4] = a.u16[0];
    t.u16[3] = b.u16[6];
    t.u16[2] = b.u16[4];
    t.u16[1] = b.u16[2];
    t.u16[0] = b.u16[0];
  }
  return t;
}

SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1)
                           : _c_v128_unzip_16(a, b, 0);
}

SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0)
                           : _c_v128_unzip_16(b, a, 1);
}

SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) {
  c_v128 t;
  if (mode) {
    t.u32[3] = b.u32[3];
    t.u32[2] = b.u32[1];
    t.u32[1] = a.u32[3];
    t.u32[0] = a.u32[1];
  } else {
    t.u32[3] = a.u32[2];
    t.u32[2] = a.u32[0];
    t.u32[1] = b.u32[2];
    t.u32[0] = b.u32[0];
  }
  return t;
}

SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1)
                           : _c_v128_unzip_32(a, b, 0);
}

SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) {
  return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0)
                           : _c_v128_unzip_32(b, a, 1);
}

SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a));
}

SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]),
                         c_v64_unpacklo_u8_s16(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]),
                         c_v64_unpacklo_u8_s16(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a));
}

SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]),
                         c_v64_unpacklo_s8_s16(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]),
                         c_v64_unpacklo_s8_s16(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]),
                         c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]),
                         c_v64_pack_s32_u16(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]),
                         c_v64_pack_s16_u8(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]),
                         c_v64_pack_s16_s8(b.v64[1], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a));
}

SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) {
  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a));
}

SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]),
                         c_v64_unpacklo_u16_s32(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]),
                         c_v64_unpacklo_s16_s32(a.v64[0]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]),
                         c_v64_unpacklo_u16_s32(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) {
  return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]),
                         c_v64_unpacklo_s16_s32(a.v64[1]));
}

SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) {
  c_v128 t;
  int c;
  for (c = 0; c < 16; c++)
    t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
                                     : pattern.u8[c] & 15];

  return t;
}
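
/* Shuffle sketch (illustrative; x is a hypothetical vector): output byte c is
 * taken from input byte pattern.u8[c] & 15, indexed from the low end of the
 * vector, with the index mirrored on CONFIG_BIG_ENDIAN builds so both hosts
 * agree. For example, an all-zero pattern broadcasts the low byte:
 *
 *   c_v128 bcast = c_v128_shuffle_8(x, c_v128_zero());
 */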

SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]),
                         c_v64_cmpgt_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]),
                         c_v64_cmplt_s8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]),
                         c_v64_cmpeq_8(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]),
                         c_v64_cmpgt_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]),
                         c_v64_cmplt_s16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) {
  return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]),
                         c_v64_cmpeq_16(a.v64[0], b.v64[0]));
}

SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]);
  return t;
}

SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]);
  return t;
}

SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) {
  c_v128 t;
  int c;
  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]);
  return t;
}

SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
  if (n == 0) return a;
  if (n < 8)
    return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
                                    c_v64_shr_n_byte(a.v64[0], 8 - n)),
                           c_v64_shl_n_byte(a.v64[0], n));
  else
    return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero());
}

SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
  if (n == 0) return a;
  if (n < 8)
    return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
                           c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
                                    c_v64_shl_n_byte(a.v64[1], 8 - n)));
  else
    return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8));
}

SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) {
  if (SIMD_CHECK && c > 15) {
    fprintf(stderr, "Error: undefined alignment %d\n", c);
    abort();
  }
  return c ? c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c))
           : b;
}
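
/* Align sketch (illustrative; p is a hypothetical byte pointer): treating
 * a:b as a 32-byte buffer with b as the low half, c_v128_align(a, b, c)
 * returns 16 consecutive bytes starting c bytes into b. For example, a
 * 3-byte-offset window across two loaded blocks:
 *
 *   c_v128 lo = c_v128_load_unaligned(p);       // bytes 0..15
 *   c_v128 hi = c_v128_load_unaligned(p + 16);  // bytes 16..31
 *   c_v128 w = c_v128_align(hi, lo, 3);         // bytes 3..18
 */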

SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c),
                         c_v64_shr_u16(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c),
                         c_v64_shr_s16(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c),
                         c_v64_shr_u32(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) {
  return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c),
                         c_v64_shr_s32(a.v64[0], c));
}

SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) {
  a.v64[1].u64 <<= c;
  a.v64[0].u64 <<= c;
  return c_v128_from_v64(a.v64[1], a.v64[0]);
}

SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) {
  a.v64[1].u64 >>= c;
  a.v64[0].u64 >>= c;
  return c_v128_from_v64(a.v64[1], a.v64[0]);
}

SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) {
  a.v64[1].s64 >>= c;
  a.v64[0].s64 >>= c;
  return c_v128_from_v64(a.v64[1], a.v64[0]);
}

SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) {
  return c_v128_shl_8(a, n);
}

SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) {
  return c_v128_shl_16(a, n);
}

SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) {
  return c_v128_shl_32(a, n);
}

SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) {
  return c_v128_shl_64(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) {
  return c_v128_shr_u8(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) {
  return c_v128_shr_u16(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) {
  return c_v128_shr_u32(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) {
  return c_v128_shr_u64(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) {
  return c_v128_shr_s8(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) {
  return c_v128_shr_s16(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) {
  return c_v128_shr_s32(a, n);
}

SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) {
  return c_v128_shr_s64(a, n);
}

typedef uint32_t c_sad128_internal_u16;

SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init(void) { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v128_sad_u16_sum(). */
SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s,
                                                 c_v128 a, c_v128 b) {
  int c;
  for (c = 0; c < 8; c++)
    s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
  return s;
}

SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; }
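
/* Same accumulator pattern as the 8-bit SAD above, but over eight 16-bit
 * lanes and without the 32-call limit check; a minimal sketch with
 * hypothetical vectors u and v holding uint16_t data:
 *
 *   c_sad128_internal_u16 acc = c_v128_sad_u16_init();
 *   acc = c_v128_sad_u16(acc, u, v);
 *   uint32_t sad = c_v128_sad_u16_sum(acc);
 */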

typedef uint64_t c_ssd128_internal_s16;

SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init(void) { return 0; }

/* Implementation dependent return value. Result must be finalised with
 * v128_ssd_s16_sum(). */
SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s,
                                                 c_v128 a, c_v128 b) {
  int c;
  for (c = 0; c < 8; c++)
    s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
         (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
  return s;
}

SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; }
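
/* Likewise for the signed 16-bit SSD, accumulated in 64 bits; a minimal
 * sketch with hypothetical vectors u and v holding int16_t data:
 *
 *   c_ssd128_internal_s16 acc = c_v128_ssd_s16_init();
 *   acc = c_v128_ssd_s16(acc, u, v);
 *   uint64_t ssd = c_v128_ssd_s16_sum(acc);
 */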

#endif  // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_