• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
12 #define VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
13 
14 #include <msa.h>
15 
16 #include "./vpx_config.h"
17 #include "vpx/vpx_integer.h"
18 
/* Description : Load one object of type 'RTYPE' from memory
   Arguments   : Inputs - RTYPE, psrc
   Details     : 'psrc' is cast to 'const RTYPE *' and dereferenced.
                 The expansion is wrapped in parentheses so that it composes
                 safely with postfix or binary operators at the call site.
*/
#define LD_B(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
22 
/* Description : Load one object of type 'RTYPE' from memory
   Arguments   : Inputs - RTYPE, psrc
   Details     : 'psrc' is cast to 'const RTYPE *' and dereferenced.
                 The expansion is wrapped in parentheses so that it composes
                 safely with postfix or binary operators at the call site.
*/
#define LD_H(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
26 
/* Description : Load one object of type 'RTYPE' from memory
   Arguments   : Inputs - RTYPE, psrc
   Details     : 'psrc' is cast to 'const RTYPE *' and dereferenced.
                 The expansion is wrapped in parentheses so that it composes
                 safely with postfix or binary operators at the call site.
*/
#define LD_W(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
30 
/* Description : Store one object of type 'RTYPE' to memory
   Arguments   : Inputs - RTYPE, in, pdst
   Details     : 'pdst' is cast to 'RTYPE *' and 'in' is assigned through it.
                 The assignment is wrapped in parentheses so the macro stays
                 a single expression wherever it is expanded.
*/
#define ST_B(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
34 
/* Description : Store one object of type 'RTYPE' to memory
   Arguments   : Inputs - RTYPE, in, pdst
   Details     : 'pdst' is cast to 'RTYPE *' and 'in' is assigned through it.
                 The assignment is wrapped in parentheses so the macro stays
                 a single expression wherever it is expanded.
*/
#define ST_H(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
38 
/* Description : Store one object of type 'RTYPE' to memory
   Arguments   : Inputs - RTYPE, in, pdst
   Details     : 'pdst' is cast to 'RTYPE *' and 'in' is assigned through it.
                 The assignment is wrapped in parentheses so the macro stays
                 a single expression wherever it is expanded.
*/
#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
41 
42 #if (__mips_isa_rev >= 6)
/* Description : Load a 32-bit word (variant built when __mips_isa_rev >= 6)
   Arguments   : Input - psrc
   Details     : Executes one 'lw' on the memory at 'psrc' and evaluates to
                 the loaded uint32_t value.
                 The memory operand spans all 4 bytes read by 'lw' so the
                 compiler knows the full extent of the access, and the
                 '__asm__'/'__volatile__' spellings keep the macro usable in
                 strict ISO C modes where 'asm' is not a keyword.
*/
#define LW(psrc)                                         \
  ({                                                     \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);     \
    uint32_t val_m;                                      \
                                                         \
    __asm__ __volatile__(                                \
        "lw  %[val_m],  %[psrc_m]  \n\t"                 \
        : [val_m] "=r"(val_m)                            \
        : [psrc_m] "m"(*(const uint8_t(*)[4])psrc_m));   \
                                                         \
    val_m;                                               \
  })
55 
/* Description : Load a 64-bit double word (variant built when
                 __mips_isa_rev >= 6)
   Arguments   : Input - psrc
   Details     : On 64-bit targets one 'ld' reads the 8 bytes at 'psrc'; the
                 memory operand spans all 8 bytes so the compiler knows the
                 full extent of the access.
                 Otherwise two LW() reads are combined: the word at (psrc)
                 becomes bits 0..31 and the word at (psrc + 4) becomes
                 bits 32..63 of the uint64_t result.
*/
#if (__mips == 64)
#define LD(psrc)                                         \
  ({                                                     \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);     \
    uint64_t val_m = 0;                                  \
                                                         \
    __asm__ __volatile__(                                \
        "ld  %[val_m],  %[psrc_m]  \n\t"                 \
        : [val_m] "=r"(val_m)                            \
        : [psrc_m] "m"(*(const uint8_t(*)[8])psrc_m));   \
                                                         \
    val_m;                                               \
  })
#else  // !(__mips == 64)
/* The local pointer is deliberately not named 'psrc_m': LW() declares a local
   of that name and would otherwise initialize it from itself. */
#define LD(psrc)                                      \
  ({                                                  \
    const uint8_t *psrc_ld = (const uint8_t *)(psrc); \
    const uint32_t val0_m = LW(psrc_ld);              \
    const uint32_t val1_m = LW(psrc_ld + 4);          \
                                                      \
    ((uint64_t)val1_m << 32) | (uint64_t)val0_m;      \
  })
#endif  // (__mips == 64)
86 
/* Description : Store a 16-bit halfword (variant built when
                 __mips_isa_rev >= 6)
   Arguments   : Inputs - val, pdst
   Details     : 'val' is converted to uint16_t and written to 'pdst' with
                 one 'sh'. The memory operand spans both bytes written, and
                 '__asm__'/'__volatile__' keep the macro usable in strict
                 ISO C modes where 'asm' is not a keyword.
*/
#define SH(val, pdst)                             \
  {                                               \
    uint8_t *pdst_m = (uint8_t *)(pdst);          \
    const uint16_t val_m = (val);                 \
                                                  \
    __asm__ __volatile__(                         \
        "sh  %[val_m],  %[pdst_m]  \n\t"          \
        : [pdst_m] "=m"(*(uint8_t(*)[2])pdst_m)   \
        : [val_m] "r"(val_m));                    \
  }
97 
/* Description : Store a 32-bit word (variant built when __mips_isa_rev >= 6)
   Arguments   : Inputs - val, pdst
   Details     : 'val' is converted to uint32_t and written to 'pdst' with
                 one 'sw'. The memory operand spans all 4 bytes written, and
                 '__asm__'/'__volatile__' keep the macro usable in strict
                 ISO C modes where 'asm' is not a keyword.
*/
#define SW(val, pdst)                             \
  {                                               \
    uint8_t *pdst_m = (uint8_t *)(pdst);          \
    const uint32_t val_m = (val);                 \
                                                  \
    __asm__ __volatile__(                         \
        "sw  %[val_m],  %[pdst_m]  \n\t"          \
        : [pdst_m] "=m"(*(uint8_t(*)[4])pdst_m)   \
        : [val_m] "r"(val_m));                    \
  }
108 
/* Description : Store a 64-bit double word (variant built when
                 __mips_isa_rev >= 6)
   Arguments   : Inputs - val, pdst
   Details     : 'val' is converted to uint64_t and written to 'pdst' with
                 one 'sd'. The memory operand spans all 8 bytes written, and
                 '__asm__'/'__volatile__' keep the macro usable in strict
                 ISO C modes where 'asm' is not a keyword.
                 NOTE(review): unlike LD in this branch there is no separate
                 definition for targets where (__mips != 64) - confirm that
                 'sd' is acceptable for every configuration that builds this.
*/
#define SD(val, pdst)                             \
  {                                               \
    uint8_t *pdst_m = (uint8_t *)(pdst);          \
    const uint64_t val_m = (val);                 \
                                                  \
    __asm__ __volatile__(                         \
        "sd  %[val_m],  %[pdst_m]  \n\t"          \
        : [pdst_m] "=m"(*(uint8_t(*)[8])pdst_m)   \
        : [val_m] "r"(val_m));                    \
  }
119 #else  // !(__mips_isa_rev >= 6)
/* Description : Load a 32-bit word (variant built when __mips_isa_rev < 6)
   Arguments   : Input - psrc
   Details     : The word is assembled by the 'lwr'/'lwl' pair applied to
                 offsets 0 and 3 from 'psrc' and returned as uint32_t.
                 The extra, otherwise unused, memory input tells the compiler
                 that the 4 bytes at 'psrc' are read; with only the address
                 in a register it had no way to know memory is accessed.
                 '__asm__'/'__volatile__' keep the macro usable in strict
                 ISO C modes where 'asm' is not a keyword.
*/
#define LW(psrc)                                                     \
  ({                                                                 \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);                 \
    uint32_t val_m;                                                  \
                                                                     \
    __asm__ __volatile__(                                            \
        "lwr %[val_m], 0(%[psrc_m]) \n\t"                            \
        "lwl %[val_m], 3(%[psrc_m]) \n\t"                            \
        : [val_m] "=&r"(val_m)                                       \
        : [psrc_m] "r"(psrc_m), "m"(*(const uint8_t(*)[4])psrc_m));  \
                                                                     \
    val_m;                                                           \
  })
133 
/* Description : Load a 64-bit double word (variant built when
                 __mips_isa_rev < 6)
   Arguments   : Input - psrc
   Details     : On 64-bit targets the value is assembled by the 'ldr'/'ldl'
                 pair applied to offsets 0 and 7 from 'psrc'; the extra,
                 otherwise unused, memory input tells the compiler that the
                 8 bytes at 'psrc' are read.
                 Otherwise two LW() reads are combined: the word at (psrc)
                 becomes bits 0..31 and the word at (psrc + 4) becomes
                 bits 32..63 of the uint64_t result.
*/
#if (__mips == 64)
#define LD(psrc)                                                     \
  ({                                                                 \
    const uint8_t *psrc_m = (const uint8_t *)(psrc);                 \
    uint64_t val_m = 0;                                              \
                                                                     \
    __asm__ __volatile__(                                            \
        "ldr %[val_m], 0(%[psrc_m]) \n\t"                            \
        "ldl %[val_m], 7(%[psrc_m]) \n\t"                            \
        : [val_m] "=&r"(val_m)                                       \
        : [psrc_m] "r"(psrc_m), "m"(*(const uint8_t(*)[8])psrc_m));  \
                                                                     \
    val_m;                                                           \
  })
#else  // !(__mips == 64)
/* The local pointer is deliberately not named 'psrc_m': LW() declares a local
   of that name and would otherwise initialize it from itself. */
#define LD(psrc)                                      \
  ({                                                  \
    const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \
    const uint32_t val0_m = LW(psrc_m1);              \
    const uint32_t val1_m = LW(psrc_m1 + 4);          \
                                                      \
    ((uint64_t)val1_m << 32) | (uint64_t)val0_m;      \
  })
#endif  // (__mips == 64)
/* Description : Store a 16-bit halfword (variant built when
                 __mips_isa_rev < 6)
   Arguments   : Inputs - val, pdst
   Details     : 'val' is converted to uint16_t and written to 'pdst' with
                 'ush'. The memory operand spans both bytes written, and
                 '__asm__'/'__volatile__' keep the macro usable in strict
                 ISO C modes where 'asm' is not a keyword.
*/
#define SH(val, pdst)                             \
  {                                               \
    uint8_t *pdst_m = (uint8_t *)(pdst);          \
    const uint16_t val_m = (val);                 \
                                                  \
    __asm__ __volatile__(                         \
        "ush  %[val_m],  %[pdst_m]  \n\t"         \
        : [pdst_m] "=m"(*(uint8_t(*)[2])pdst_m)   \
        : [val_m] "r"(val_m));                    \
  }
175 
/* Description : Store a 32-bit word (variant built when __mips_isa_rev < 6)
   Arguments   : Inputs - val, pdst
   Details     : 'val' is converted to uint32_t and written to 'pdst' with
                 'usw'. The memory operand spans all 4 bytes written, and
                 '__asm__'/'__volatile__' keep the macro usable in strict
                 ISO C modes where 'asm' is not a keyword.
*/
#define SW(val, pdst)                             \
  {                                               \
    uint8_t *pdst_m = (uint8_t *)(pdst);          \
    const uint32_t val_m = (val);                 \
                                                  \
    __asm__ __volatile__(                         \
        "usw  %[val_m],  %[pdst_m]  \n\t"         \
        : [pdst_m] "=m"(*(uint8_t(*)[4])pdst_m)   \
        : [val_m] "r"(val_m));                    \
  }
186 
/* Description : Store a 64-bit double word (variant built when
                 __mips_isa_rev < 6)
   Arguments   : Inputs - val, pdst
   Details     : 'val' is evaluated exactly once and converted to uint64_t.
                 Bits 0..31 are stored with SW() to (pdst) and bits 32..63
                 are stored with SW() to (pdst + 4).
                 Local names differ from the ones SW() declares ('pdst_m',
                 'val_m') so that the nested expansion does not shadow them.
*/
#define SD(val, pdst)                                          \
  {                                                            \
    uint8_t *pdst_m1 = (uint8_t *)(pdst);                      \
    const uint64_t val_sd_m = (val);                           \
    const uint32_t val0_m = (uint32_t)(val_sd_m & 0xFFFFFFFF); \
    const uint32_t val1_m = (uint32_t)(val_sd_m >> 32);        \
                                                               \
    SW(val0_m, pdst_m1);                                       \
    SW(val1_m, pdst_m1 + 4);                                   \
  }
198 #endif  // (__mips_isa_rev >= 6)
199 
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : Load word in 'out0' from (psrc)
                 Load word in 'out1' from (psrc + stride)
                 Load word in 'out2' from (psrc + 2 * stride)
                 Load word in 'out3' from (psrc + 3 * stride)
                 'stride' is parenthesized in the expansion, so an expression
                 argument is scaled as a whole
*/
#define LW4(psrc, stride, out0, out1, out2, out3) \
  {                                               \
    out0 = LW((psrc));                            \
    out1 = LW((psrc) + (stride));                 \
    out2 = LW((psrc) + 2 * (stride));             \
    out3 = LW((psrc) + 3 * (stride));             \
  }
215 
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
   Details     : Load double word in 'out0' from (psrc)
                 Load double word in 'out1' from (psrc + stride)
                 'stride' is parenthesized in the expansion, so an expression
                 argument is applied as a whole
*/
#define LD2(psrc, stride, out0, out1) \
  {                                   \
    out0 = LD((psrc));                \
    out1 = LD((psrc) + (stride));     \
  }
/* Description : Load 4 double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : 'out0'/'out1' are loaded by LD2 starting at (psrc) and
                 'out2'/'out3' by LD2 starting at (psrc + 2 * stride).
                 'stride' is parenthesized before it is scaled or forwarded
*/
#define LD4(psrc, stride, out0, out1, out2, out3)     \
  {                                                   \
    LD2((psrc), (stride), out0, out1);                \
    LD2((psrc) + 2 * (stride), (stride), out2, out3); \
  }
232 
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store word from 'in0' to (pdst)
                 Store word from 'in1' to (pdst + stride)
                 Store word from 'in2' to (pdst + 2 * stride)
                 Store word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in the expansion, so an expression
                 argument is scaled as a whole
*/
#define SW4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SW(in0, (pdst));                          \
    SW(in1, (pdst) + (stride));               \
    SW(in2, (pdst) + 2 * (stride));           \
    SW(in3, (pdst) + 3 * (stride));           \
  }
247 
/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store double word from 'in0' to (pdst)
                 Store double word from 'in1' to (pdst + stride)
                 Store double word from 'in2' to (pdst + 2 * stride)
                 Store double word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in the expansion, so an expression
                 argument is scaled as a whole
*/
#define SD4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SD(in0, (pdst));                          \
    SD(in1, (pdst) + (stride));               \
    SD(in2, (pdst) + 2 * (stride));           \
    SD(in3, (pdst) + 3 * (stride));           \
  }
262 
/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
                 'stride' is parenthesized in the expansion, so an expression
                 argument is applied as a whole
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_B(RTYPE, (psrc));                \
    out1 = LD_B(RTYPE, (psrc) + (stride));     \
  }
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
277 
/* Description : Load 3 vectors with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : 'out0'/'out1' are loaded by LD_B2 starting at (psrc) and
                 'out2' is loaded from (psrc + 2 * stride).
                 'stride' is parenthesized before it is scaled or forwarded
*/
#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \
  {                                                  \
    LD_B2(RTYPE, (psrc), (stride), out0, out1);      \
    out2 = LD_B(RTYPE, (psrc) + 2 * (stride));       \
  }
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
285 
/* Description : Load 4 vectors with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : 'out0'/'out1' are loaded by LD_B2 starting at (psrc) and
                 'out2'/'out3' by LD_B2 starting at (psrc + 2 * stride).
                 'stride' is parenthesized before it is scaled or forwarded
*/
#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)     \
  {                                                            \
    LD_B2(RTYPE, (psrc), (stride), out0, out1);                \
    LD_B2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
  }
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
293 
/* Description : Load 5 vectors with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3, out4
                 Return Type - as per RTYPE
   Details     : 'out0'..'out3' are loaded by LD_B4 starting at (psrc) and
                 'out4' is loaded from (psrc + 4 * stride).
                 'stride' is parenthesized before it is scaled or forwarded
*/
#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
  {                                                              \
    LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3);      \
    out4 = LD_B(RTYPE, (psrc) + 4 * (stride));                   \
  }
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
301 
/* Description : Load 8 vectors with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : 'out0'..'out3' are loaded by LD_B4 starting at (psrc) and
                 'out4'..'out7' by LD_B4 starting at (psrc + 4 * stride).
                 'stride' is parenthesized before it is scaled or forwarded
*/
#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
              out7)                                                          \
  {                                                                          \
    LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3);                  \
    LD_B4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7);   \
  }
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
310 
/* Description : Load vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Load 8 halfword elements in 'out0' from (psrc)
                 Load 8 halfword elements in 'out1' from (psrc + stride)
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_H(RTYPE, (psrc));                \
    out1 = LD_H(RTYPE, (psrc) + (stride));     \
  }
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
323 
/* Description : Load 4 vectors with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : 'out0'/'out1' are loaded by LD_H2 starting at (psrc) and
                 'out2'/'out3' by LD_H2 starting at (psrc + 2 * stride).
                 'stride' is parenthesized before it is scaled or forwarded
*/
#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)     \
  {                                                            \
    LD_H2(RTYPE, (psrc), (stride), out0, out1);                \
    LD_H2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
  }
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
330 
/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : Load 'out0' with LD_SW from (psrc)
                 Load 'out1' with LD_SW from (psrc + stride)
                 'stride' is parenthesized in the expansion, so an expression
                 argument is applied as a whole
*/
#define LD_SW2(psrc, stride, out0, out1) \
  {                                      \
    out0 = LD_SW((psrc));                \
    out1 = LD_SW((psrc) + (stride));     \
  }
341 
/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
                 'stride' is parenthesized in the expansion, so an expression
                 argument is applied as a whole
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_B(RTYPE, in0, (pdst));                \
    ST_B(RTYPE, in1, (pdst) + (stride));     \
  }
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
353 
/* Description : Store 4 vectors with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : 'in0'/'in1' are stored by ST_B2 starting at (pdst) and
                 'in2'/'in3' by ST_B2 starting at (pdst + 2 * stride).
                 'stride' is parenthesized before it is scaled or forwarded
*/
#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)       \
  {                                                          \
    ST_B2(RTYPE, in0, in1, (pdst), (stride));                \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
  }
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
361 
/* Description : Store 8 vectors with stride
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : 'in0'..'in3' are stored by ST_B4 starting at (pdst) and
                 'in4'..'in7' by ST_B4 starting at (pdst + 4 * stride).
                 'pdst' and 'stride' are parenthesized before they are scaled
                 or forwarded
*/
#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
  {                                                                        \
    ST_B4(RTYPE, in0, in1, in2, in3, (pdst), (stride));                    \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));     \
  }
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
368 
/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
                 'stride' is parenthesized in the expansion, so an expression
                 argument is applied as a whole
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_H(RTYPE, in0, (pdst));                \
    ST_H(RTYPE, in1, (pdst) + (stride));     \
  }
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
380 
/* Description : Store vectors of word elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 4 word elements from 'in0' to (pdst)
                 Store 4 word elements from 'in1' to (pdst + stride)
                 'stride' is parenthesized in the expansion, so an expression
                 argument is applied as a whole
*/
#define ST_SW2(in0, in1, pdst, stride) \
  {                                    \
    ST_SW(in0, (pdst));                \
    ST_SW(in1, (pdst) + (stride));     \
  }
391 
/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index 'stidx' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst)
                 Index 'stidx+1' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + stride)
                 Index 'stidx+2' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 2 * stride)
                 Index 'stidx+3' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 3 * stride)
                 'in', 'stidx' and 'stride' are parenthesized in the expansion
                 so that expression arguments are applied as a whole
*/
#define ST2x4_UB(in, stidx, pdst, stride)              \
  {                                                    \
    uint16_t out0_m, out1_m, out2_m, out3_m;           \
    uint8_t *pblk_2x4_m = (uint8_t *)(pdst);           \
                                                       \
    out0_m = __msa_copy_u_h((v8i16)(in), (stidx));     \
    out1_m = __msa_copy_u_h((v8i16)(in), (stidx) + 1); \
    out2_m = __msa_copy_u_h((v8i16)(in), (stidx) + 2); \
    out3_m = __msa_copy_u_h((v8i16)(in), (stidx) + 3); \
                                                       \
    SH(out0_m, pblk_2x4_m);                            \
    SH(out1_m, pblk_2x4_m + (stride));                 \
    SH(out2_m, pblk_2x4_m + 2 * (stride));             \
    SH(out3_m, pblk_2x4_m + 3 * (stride));             \
  }
418 
/* Description : Store 4x4 byte block to destination memory from input vectors
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 The vector, index and stride arguments are parenthesized in
                 the expansion so that expression arguments are applied as a
                 whole
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
  {                                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;                     \
    uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                     \
                                                                 \
    out0_m = __msa_copy_u_w((v4i32)(in0), (idx0));               \
    out1_m = __msa_copy_u_w((v4i32)(in0), (idx1));               \
    out2_m = __msa_copy_u_w((v4i32)(in1), (idx2));               \
    out3_m = __msa_copy_u_w((v4i32)(in1), (idx3));               \
                                                                 \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride));   \
  }
/* Description : Store 4x8 byte block to destination memory from input vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Word elements 0..3 of 'in0' are stored by ST4x4_UB to the
                 4 rows starting at (pdst); word elements 0..3 of 'in1' are
                 stored to the 4 rows starting at (pdst + 4 * stride).
                 'in0', 'in1' and 'stride' are parenthesized before they are
                 scaled or forwarded
*/
#define ST4x8_UB(in0, in1, pdst, stride)                                 \
  {                                                                      \
    uint8_t *pblk_4x8 = (uint8_t *)(pdst);                               \
                                                                         \
    ST4x4_UB((in0), (in0), 0, 1, 2, 3, pblk_4x8, (stride));              \
    ST4x4_UB((in1), (in1), 0, 1, 2, 3, pblk_4x8 + 4 * (stride), (stride)); \
  }
449 
/* Description : Store 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 'in' is parenthesized before the cast so that an expression
                 argument is converted as a whole
*/
#define ST8x1_UB(in, pdst)                   \
  {                                          \
    uint64_t out0_m;                         \
                                             \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    SD(out0_m, (pdst));                      \
  }
462 
/* Description : Store 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst + stride)
                 'in' and 'stride' are parenthesized in the expansion so that
                 expression arguments are applied as a whole
*/
#define ST8x2_UB(in, pdst, stride)           \
  {                                          \
    uint64_t out0_m, out1_m;                 \
    uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
                                             \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    out1_m = __msa_copy_u_d((v2i64)(in), 1); \
                                             \
    SD(out0_m, pblk_8x2_m);                  \
    SD(out1_m, pblk_8x2_m + (stride));       \
  }
481 
/* Description : Store 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst + stride)
                 Index 0 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 Index 1 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 'in0', 'in1' and 'stride' are parenthesized in the expansion
                 so that expression arguments are applied as a whole
*/
#define ST8x4_UB(in0, in1, pdst, stride)                       \
  {                                                            \
    uint64_t out0_m, out1_m, out2_m, out3_m;                   \
    uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                   \
                                                               \
    out0_m = __msa_copy_u_d((v2i64)(in0), 0);                  \
    out1_m = __msa_copy_u_d((v2i64)(in0), 1);                  \
    out2_m = __msa_copy_u_d((v2i64)(in1), 0);                  \
    out3_m = __msa_copy_u_d((v2i64)(in1), 1);                  \
                                                               \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, (stride)); \
  }
506 
/* Description : Immediate number of elements to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
                 value specified in the 'slide_val'; the result is written to
                 'out0'. 'in1'/'out1' are processed the same way.
                 The inputs are parenthesized before the casts so that
                 expression arguments are converted as a whole
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
  {                                                                       \
    v16i8 zero_m = { 0 };                                                 \
                                                                          \
    out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in0), (slide_val)); \
    out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in1), (slide_val)); \
  }
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
522 
/* Description : Immediate number of elements to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                 value specified in the 'slide_val'; the result is written to
                 'out0'. 'in0_1'/'in1_1'/'out1' are processed the same way.
                 The inputs are parenthesized before the casts so that
                 expression arguments are converted as a whole
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)    \
  {                                                                          \
    out0 = (RTYPE)__msa_sldi_b((v16i8)(in0_0), (v16i8)(in1_0), (slide_val)); \
    out1 = (RTYPE)__msa_sldi_b((v16i8)(in0_1), (v16i8)(in1_1), (slide_val)); \
  }
535 
/* Description : Immediate number of elements to slide, 3 vector pairs
   Arguments   : Inputs  - in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, slide_val
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : 'out0'/'out1' are produced by SLDI_B2; 'out2' is produced by
                 the same slide applied to 'in0_2' and 'in1_2'.
                 The inputs are parenthesized before they are cast or
                 forwarded so that expression arguments are applied as a whole
*/
#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
                out2, slide_val)                                             \
  {                                                                          \
    SLDI_B2(RTYPE, (in0_0), (in0_1), (in1_0), (in1_1), out0, out1,           \
            (slide_val));                                                    \
    out2 = (RTYPE)__msa_sldi_b((v16i8)(in0_2), (v16i8)(in1_2), (slide_val)); \
  }
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
543 
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'; 'in2' & 'in3' are
                 shuffled into 'out1' as per 'mask1'.
                 The inputs are parenthesized before the casts so that
                 expression arguments are converted as a whole
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)        \
  {                                                                         \
    out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2)); \
  }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
559 
/* Description : Shuffle byte vector elements as per mask vector, 3 outputs
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, mask0, mask1, mask2
                 Outputs - out0, out1, out2
                 Return Type - as per RTYPE
   Details     : 'out0'/'out1' are produced by VSHF_B2; 'out2' is produced by
                 shuffling 'in4' & 'in5' as per control vector 'mask2'.
                 The inputs are parenthesized before they are cast or
                 forwarded so that expression arguments are applied as a whole
*/
#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,   \
                out0, out1, out2)                                           \
  {                                                                         \
    VSHF_B2(RTYPE, (in0), (in1), (in2), (in3), (mask0), (mask1), out0,      \
            out1);                                                          \
    out2 = (RTYPE)__msa_vshf_b((v16i8)(mask2), (v16i8)(in5), (v16i8)(in4)); \
  }
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)
567 
/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : halfword elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'; 'in2' & 'in3' are
                 shuffled into 'out1' as per 'mask1'.
                 The inputs are parenthesized before the casts so that
                 expression arguments are converted as a whole
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)        \
  {                                                                         \
    out0 = (RTYPE)__msa_vshf_h((v8i16)(mask0), (v8i16)(in1), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_vshf_h((v8i16)(mask1), (v8i16)(in3), (v8i16)(in2)); \
  }
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
581 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0' producing a result
                 twice the size of input i.e. unsigned halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1'/'cnst1' produce 'out1' the same way.
                 The inputs are parenthesized before the casts so that
                 expression arguments are converted as a whole
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_u_h((v16u8)(mult0), (v16u8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_u_h((v16u8)(mult1), (v16u8)(cnst1)); \
  }
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
598 
/* Description : Dot product of byte vector elements, 4 vector pairs
   Arguments   : Inputs  - mult0, mult1, mult2, mult3,
                           cnst0, cnst1, cnst2, cnst3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : 'out0'/'out1' and 'out2'/'out3' are each produced by one
                 DOTP_UB2 expansion. The inputs are parenthesized before they
                 are forwarded so that expression arguments stay intact
*/
#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_UB2(RTYPE, (mult0), (mult1), (cnst0), (cnst1), out0, out1);     \
    DOTP_UB2(RTYPE, (mult2), (mult3), (cnst2), (cnst3), out2, out3);     \
  }
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
606 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1'/'cnst1' produce 'out1' the same way.
                 The inputs are parenthesized before the casts so that
                 expression arguments are converted as a whole
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1)); \
  }
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
623 
/* Description : Dot product of byte vector elements, 4 vector pairs
   Arguments   : Inputs  - mult0, mult1, mult2, mult3,
                           cnst0, cnst1, cnst2, cnst3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : 'out0'/'out1' and 'out2'/'out3' are each produced by one
                 DOTP_SB2 expansion. The inputs are parenthesized before they
                 are forwarded so that expression arguments stay intact
*/
#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_SB2(RTYPE, (mult0), (mult1), (cnst0), (cnst1), out0, out1);     \
    DOTP_SB2(RTYPE, (mult2), (mult3), (cnst2), (cnst3), out2, out3);     \
  }
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
631 
632 /* Description : Dot product of halfword vector elements
633    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
634                  Outputs - out0, out1
635                  Return Type - as per RTYPE
636    Details     : Signed halfword elements from 'mult0' are multiplied with
637                  signed halfword elements from 'cnst0' producing a result
638                  twice the size of input i.e. signed word.
639                  The multiplication result of adjacent odd-even elements
640                  are added together and written to the 'out0' vector
641 */
642 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
643   {                                                               \
644     out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0)); \
645     out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1)); \
646   }
647 
648 #define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
649                  cnst3, out0, out1, out2, out3)                          \
650   {                                                                      \
651     DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
652     DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
653   }
654 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
655 
656 /* Description : Dot product of word vector elements
657    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
658                  Outputs - out0, out1
659                  Return Type - as per RTYPE
660    Details     : Signed word elements from 'mult0' are multiplied with
661                  signed word elements from 'cnst0' producing a result
662                  twice the size of input i.e. signed double word.
663                  The multiplication result of adjacent odd-even elements
664                  are added together and written to the 'out0' vector
665 */
666 #define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
667   {                                                               \
668     out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0)); \
669     out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1)); \
670   }
671 #define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
672 
673 /* Description : Dot product & addition of byte vector elements
674    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
675                  Outputs - out0, out1
676                  Return Type - as per RTYPE
677    Details     : Signed byte elements from 'mult0' are multiplied with
678                  signed byte elements from 'cnst0' producing a result
679                  twice the size of input i.e. signed halfword.
680                  The multiplication result of adjacent odd-even elements
681                  are added to the 'out0' vector
682 */
683 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)                  \
684   {                                                                               \
685     out0 = (RTYPE)__msa_dpadd_s_h((v8i16)(out0), (v16i8)(mult0), (v16i8)(cnst0)); \
686     out1 = (RTYPE)__msa_dpadd_s_h((v8i16)(out1), (v16i8)(mult1), (v16i8)(cnst1)); \
687   }
688 #define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
689 
690 #define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
691                   cnst3, out0, out1, out2, out3)                          \
692   {                                                                       \
693     DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
694     DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
695   }
696 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
697 
698 /* Description : Dot product & addition of halfword vector elements
699    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
700                  Outputs - out0, out1
701                  Return Type - as per RTYPE
702    Details     : Signed halfword elements from 'mult0' are multiplied with
703                  signed halfword elements from 'cnst0' producing a result
704                  twice the size of input i.e. signed word.
705                  The multiplication result of adjacent odd-even elements
706                  are added to the 'out0' vector
707 */
708 #define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)                  \
709   {                                                                               \
710     out0 = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0), (v8i16)(cnst0)); \
711     out1 = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1), (v8i16)(cnst1)); \
712   }
713 #define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
714 
715 #define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
716                   cnst3, out0, out1, out2, out3)                          \
717   {                                                                       \
718     DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
719     DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
720   }
721 #define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
722 
723 /* Description : Dot product of word vector elements with themselves,
724                  accumulated into double word vector elements
725    Arguments   : Inputs  - mult0, mult1
726                  Outputs - out0, out1 (also read as the accumulators)
727    Details     : Each signed word element from 'mult0' is multiplied with itself
728                  producing an intermediate result twice the size of it
729                  i.e. signed double word
730                  The multiplication result of adjacent odd-even elements
731                  are added to the 'out0' vector (Return Type - as per RTYPE)
732 */
733 #define DPADD_SD2(RTYPE, mult0, mult1, out0, out1)                                \
734   {                                                                               \
735     out0 = (RTYPE)__msa_dpadd_s_d((v2i64)(out0), (v4i32)(mult0), (v4i32)(mult0)); \
736     out1 = (RTYPE)__msa_dpadd_s_d((v2i64)(out1), (v4i32)(mult1), (v4i32)(mult1)); \
737   }
738 #define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
739 
740 /* Description : Clips all signed halfword elements of input vector
741                  between 0 & 255
742    Arguments   : Input  - in
743                  Output - out_m
744                  Return Type - signed halfword
745 */
746 #define CLIP_SH_0_255(in)                              \
747   ({                                                   \
748     v8i16 max_m = __msa_ldi_h(255);                    \
749     v8i16 out_m;                                       \
750                                                        \
751     out_m = __msa_maxi_s_h((v8i16)(in), 0);            \
752     out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
753     out_m;                                             \
754   })
755 #define CLIP_SH2_0_255(in0, in1) \
756   {                              \
757     in0 = CLIP_SH_0_255(in0);    \
758     in1 = CLIP_SH_0_255(in1);    \
759   }
760 #define CLIP_SH4_0_255(in0, in1, in2, in3) \
761   {                                        \
762     CLIP_SH2_0_255(in0, in1);              \
763     CLIP_SH2_0_255(in2, in3);              \
764   }
765 
766 /* Description : Clips all signed word elements of input vector
767                  between 0 & 255
768    Arguments   : Input  - in
769                  Output - out_m
770                  Return Type - signed word
771 */
772 #define CLIP_SW_0_255(in)                              \
773   ({                                                   \
774     v4i32 max_m = __msa_ldi_w(255);                    \
775     v4i32 out_m;                                       \
776                                                        \
777     out_m = __msa_maxi_s_w((v4i32)(in), 0);            \
778     out_m = __msa_min_s_w((v4i32)max_m, (v4i32)out_m); \
779     out_m;                                             \
780   })
781 
782 /* Description : Horizontal addition of 4 signed word elements of input vector
783    Arguments   : Input  - in       (signed word vector)
784                  Output - sum_m    (i32 sum)
785                  Return Type - signed word (GP)
786    Details     : 4 signed word elements of 'in' vector are added together and
787                  the resulting integer sum is returned
788 */
789 #define HADD_SW_S32(in)                                \
790   ({                                                   \
791     v2i64 res0_m, res1_m;                              \
792     int32_t sum_m;                                     \
793                                                        \
794     res0_m = __msa_hadd_s_d((v4i32)(in), (v4i32)(in)); \
795     res1_m = __msa_splati_d(res0_m, 1);                \
796     res0_m = res0_m + res1_m;                          \
797     sum_m = __msa_copy_s_w((v4i32)res0_m, 0);          \
798     sum_m;                                             \
799   })
800 
801 /* Description : Horizontal addition of 8 unsigned halfword elements
802    Arguments   : Inputs  - in       (unsigned halfword vector)
803                  Outputs - sum_m    (u32 sum)
804                  Return Type - unsigned word
805    Details     : 8 unsigned halfword elements of input vector are added
806                  together and the resulting integer sum is returned
807 */
808 #define HADD_UH_U32(in)                               \
809   ({                                                  \
810     v4u32 res_m;                                      \
811     v2u64 res0_m, res1_m;                             \
812     uint32_t sum_m;                                   \
813                                                       \
814     res_m = __msa_hadd_u_w((v8u16)(in), (v8u16)(in)); \
815     res0_m = __msa_hadd_u_d(res_m, res_m);            \
816     res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \
817     res0_m = res0_m + res1_m;                         \
818     sum_m = __msa_copy_u_w((v4i32)res0_m, 0);         \
819     sum_m;                                            \
820   })
821 
822 /* Description : Horizontal addition of unsigned byte vector elements
823    Arguments   : Inputs  - in0, in1
824                  Outputs - out0, out1
825                  Return Type - as per RTYPE
826    Details     : Each unsigned odd byte element from 'in0' is added to
827                  even unsigned byte element from 'in0' (pairwise) and the
828                  halfword result is written to 'out0'
829 */
830 #define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
831   {                                                           \
832     out0 = (RTYPE)__msa_hadd_u_h((v16u8)(in0), (v16u8)(in0)); \
833     out1 = (RTYPE)__msa_hadd_u_h((v16u8)(in1), (v16u8)(in1)); \
834   }
835 #define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
836 
837 /* Description : Horizontal subtraction of unsigned byte vector elements
838    Arguments   : Inputs  - in0, in1
839                  Outputs - out0, out1
840                  Return Type - as per RTYPE
841    Details     : Each even unsigned byte element of 'in0' is subtracted from
842                  the adjacent odd unsigned byte element of 'in0' (pairwise)
843                  and the signed halfword result is written to 'out0'
844 */
845 #define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
846   {                                                           \
847     out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0)); \
848     out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1)); \
849   }
850 #define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
851 
852 /* Description : Horizontal subtraction of signed halfword vector elements
853    Arguments   : Inputs  - in0, in1
854                  Outputs - out0, out1
855                  Return Type - as per RTYPE
856    Details     : Each even signed halfword element of 'in0' is subtracted from
857                  the adjacent odd signed halfword element of 'in0' (pairwise)
858                  and the word result is written to 'out0'
859 */
860 #define HSUB_UH2(RTYPE, in0, in1, out0, out1)                 \
861   {                                                           \
862     out0 = (RTYPE)__msa_hsub_s_w((v8i16)(in0), (v8i16)(in0)); \
863     out1 = (RTYPE)__msa_hsub_s_w((v8i16)(in1), (v8i16)(in1)); \
864   }
865 #define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
866 
867 /* Description : Set double word elements of a vector to GPR values
868    Arguments   : Inputs - in0, in1
869                  Output - out
870                  Return Type - as per RTYPE
871    Details     : Double word elements 0 and 1 of 'out' are set to 'in0' and 'in1'
872 */
873 #define INSERT_D2(RTYPE, in0, in1, out)                  \
874   {                                                      \
875     out = (RTYPE)__msa_insert_d((v2i64)(out), 0, (in0)); \
876     out = (RTYPE)__msa_insert_d((v2i64)(out), 1, (in1)); \
877   }
878 #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
879 
880 /* Description : Interleave even byte elements from vectors
881    Arguments   : Inputs  - in0, in1, in2, in3
882                  Outputs - out0, out1
883                  Return Type - as per RTYPE
884    Details     : Even byte elements of 'in0' and 'in1' are interleaved
885                  and written to 'out0'
886 */
887 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
888   {                                                          \
889     out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0)); \
890     out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2)); \
891   }
892 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
893 #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
894 #define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
895 
896 /* Description : Interleave even halfword elements from vectors
897    Arguments   : Inputs  - in0, in1, in2, in3
898                  Outputs - out0, out1
899                  Return Type - as per RTYPE
900    Details     : Even halfword elements of 'in0' and 'in1' are interleaved
901                  and written to 'out0'
902 */
903 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
904   {                                                          \
905     out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0)); \
906     out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2)); \
907   }
908 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
909 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
910 
911 /* Description : Interleave even word elements from vectors
912    Arguments   : Inputs  - in0, in1, in2, in3
913                  Outputs - out0, out1
914                  Return Type - as per RTYPE
915    Details     : Even word elements of 'in0' and 'in1' are interleaved
916                  and written to 'out0'
917 */
918 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
919   {                                                          \
920     out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0)); \
921     out1 = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2)); \
922   }
923 #define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
924 
925 /* Description : Interleave even double word elements from vectors
926    Arguments   : Inputs  - in0, in1, in2, in3
927                  Outputs - out0, out1
928                  Return Type - as per RTYPE
929    Details     : Even double word elements of 'in0' and 'in1' are interleaved
930                  and written to 'out0'
931 */
932 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
933   {                                                          \
934     out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0)); \
935     out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2)); \
936   }
937 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
938 
939 /* Description : Interleave left half of byte elements from vectors
940    Arguments   : Inputs  - in0, in1, in2, in3
941                  Outputs - out0, out1
942                  Return Type - as per RTYPE
943    Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
944                  and written to 'out0'.
945 */
946 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
947   {                                                         \
948     out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
949     out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3)); \
950   }
951 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
952 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
953 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
954 
955 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
956                 out2, out3)                                                \
957   {                                                                        \
958     ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
959     ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
960   }
961 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
962 #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
963 
964 /* Description : Interleave left half of halfword elements from vectors
965    Arguments   : Inputs  - in0, in1, in2, in3
966                  Outputs - out0, out1
967                  Return Type - as per RTYPE
968    Details     : Left half of halfword elements of 'in0' and 'in1' are
969                  interleaved and written to 'out0'.
970 */
971 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
972   {                                                         \
973     out0 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
974     out1 = (RTYPE)__msa_ilvl_h((v8i16)(in2), (v8i16)(in3)); \
975   }
976 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
977 #define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
978 
979 /* Description : Interleave left half of word elements from vectors
980    Arguments   : Inputs  - in0, in1, in2, in3
981                  Outputs - out0, out1
982                  Return Type - as per RTYPE
983    Details     : Left half of word elements of 'in0' and 'in1' are interleaved
984                  and written to 'out0'.
985 */
986 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
987   {                                                         \
988     out0 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
989     out1 = (RTYPE)__msa_ilvl_w((v4i32)(in2), (v4i32)(in3)); \
990   }
991 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
992 
993 /* Description : Interleave right half of byte elements from vectors
994    Arguments   : Inputs  - in0, in1, in2, in3
995                  Outputs - out0, out1
996                  Return Type - as per RTYPE
997    Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
998                  and written to out0.
999 */
1000 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
1001   {                                                         \
1002     out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
1003     out1 = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3)); \
1004   }
1005 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
1006 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
1007 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
1008 #define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
1009 
1010 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
1011                 out2, out3)                                                \
1012   {                                                                        \
1013     ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
1014     ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
1015   }
1016 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
1017 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
1018 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
1019 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
1020 #define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
1021 
1022 /* Description : Interleave right half of halfword elements from vectors
1023    Arguments   : Inputs  - in0, in1, in2, in3
1024                  Outputs - out0, out1
1025                  Return Type - as per RTYPE
1026    Details     : Right half of halfword elements of 'in0' and 'in1' are
1027                  interleaved and written to 'out0'.
1028 */
1029 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
1030   {                                                         \
1031     out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
1032     out1 = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3)); \
1033   }
1034 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
1035 #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
1036 
1037 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
1038                 out2, out3)                                                \
1039   {                                                                        \
1040     ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
1041     ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
1042   }
1043 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1044 #define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
1045 
1046 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
1047   {                                                         \
1048     out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
1049     out1 = (RTYPE)__msa_ilvr_w((v4i32)(in2), (v4i32)(in3)); \
1050   }
1051 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
1052 
1053 /* Description : Interleave right half of double word elements from vectors
1054    Arguments   : Inputs  - in0, in1, in2, in3
1055                  Outputs - out0, out1
1056                  Return Type - as per RTYPE
1057    Details     : Right half of double word elements of 'in0' and 'in1' are
1058                  interleaved and written to 'out0'.
1059 */
1060 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
1061   {                                                         \
1062     out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
1063     out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
1064   }
1065 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
1066 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
1067 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
1068 /* Four-output variant: ILVR_D2 on (in0..in3) and on (in4..in7) */
1069 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
1070                 out2, out3)                                                \
1071   {                                                                        \
1072     ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
1073     ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
1074   }
1075 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
1076 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1077 
1078 /* Description : Interleave both left and right half of input vectors
1079    Arguments   : Inputs  - in0, in1
1080                  Outputs - out0, out1
1081                  Return Type - as per RTYPE
1082    Details     : Right (left) half of byte elements from 'in0' and 'in1' are
1083                  interleaved and written to 'out0' ('out1')
1084 */
1085 #define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
1086   {                                                         \
1087     out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
1088     out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
1089   }
1090 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
1091 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
1092 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
1093 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1094 
1095 #define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
1096   {                                                         \
1097     out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
1098     out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
1099   }
1100 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
1101 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1102 
1103 #define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
1104   {                                                         \
1105     out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
1106     out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
1107   }
1108 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
1109 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
1110 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1111 
1112 /* Description : Maximum values between signed elements of vector and
1113                  5-bit signed immediate value are copied to the output vector
1114    Arguments   : Inputs  - in0, in1, max_val
1115                  Outputs - in place operation
1116                  Return Type - as per RTYPE
1117    Details     : Maximum of signed halfword element values from 'in0' and
1118                  'max_val' are written in place
1119 */
1120 #define MAXI_SH2(RTYPE, in0, in1, max_val)                \
1121   {                                                       \
1122     in0 = (RTYPE)__msa_maxi_s_h((v8i16)(in0), (max_val)); \
1123     in1 = (RTYPE)__msa_maxi_s_h((v8i16)(in1), (max_val)); \
1124   }
1125 #define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
1126 
1127 /* Description : Saturate the halfword element values to the max
1128                  unsigned value of (sat_val + 1) bits
1129                  The element data width remains unchanged
1130    Arguments   : Inputs  - in0, in1, sat_val
1131                  Outputs - in place operation
1132                  Return Type - as per RTYPE
1133    Details     : Each unsigned halfword element from 'in0' is saturated to the
1134                  value generated with (sat_val + 1) bit range.
1135                  The results are written in place
1136 */
1137 #define SAT_UH2(RTYPE, in0, in1, sat_val)                \
1138   {                                                      \
1139     in0 = (RTYPE)__msa_sat_u_h((v8u16)(in0), (sat_val)); \
1140     in1 = (RTYPE)__msa_sat_u_h((v8u16)(in1), (sat_val)); \
1141   }
1142 #define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
1143 
1144 /* Description : Saturate the halfword element values to the
1145                  signed range of (sat_val + 1) bits
1146                  The element data width remains unchanged
1147    Arguments   : Inputs  - in0, in1, sat_val
1148                  Outputs - in place operation
1149                  Return Type - as per RTYPE
1150    Details     : Each signed halfword element from 'in0' is saturated to the
1151                  signed value range representable in (sat_val + 1) bits
1152                  The results are written in place
1153 */
1154 #define SAT_SH2(RTYPE, in0, in1, sat_val)                \
1155   {                                                      \
1156     in0 = (RTYPE)__msa_sat_s_h((v8i16)(in0), (sat_val)); \
1157     in1 = (RTYPE)__msa_sat_s_h((v8i16)(in1), (sat_val)); \
1158   }
1159 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1160 
1161 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
1162   {                                                 \
1163     SAT_SH2(RTYPE, in0, in1, sat_val);              \
1164     SAT_SH2(RTYPE, in2, in3, sat_val);              \
1165   }
1166 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1167 
1168 /* Description : Indexed halfword element values are replicated to all
1169                  elements in output vector
1170    Arguments   : Inputs  - in, idx0, idx1
1171                  Outputs - out0, out1
1172                  Return Type - as per RTYPE
1173    Details     : 'idx0' ('idx1') element value from 'in' vector is replicated
1174                  to all elements in 'out0' ('out1') vector
1175                  Valid index range for halfword operation is 0-7
1176 */
1177 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
1178   {                                                  \
1179     out0 = (RTYPE)__msa_splati_h((v8i16)(in), idx0); \
1180     out1 = (RTYPE)__msa_splati_h((v8i16)(in), idx1); \
1181   }
1182 #define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
1183 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1184 
1185 #define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, out0, out1, out2) \
1186   {                                                              \
1187     SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);                \
1188     out2 = (RTYPE)__msa_splati_h((v8i16)(in), idx2);             \
1189   }
1190 #define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
1191 #define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
1192 
1193 /* Description : Indexed word element values are replicated to all
1194                  elements in output vector
1195    Arguments   : Inputs  - in, stidx
1196                  Outputs - out0, out1
1197                  Return Type - as per RTYPE
1198    Details     : 'stidx' element value from 'in' vector is replicated to all
1199                  elements in 'out0' vector
1200                  'stidx + 1' element value from 'in' vector is replicated to all
1201                  elements in 'out1' vector
1202                  Valid index range for word operation is 0-3
1203 */
1204 #define SPLATI_W2(RTYPE, in, stidx, out0, out1)               \
1205   {                                                           \
1206     out0 = (RTYPE)__msa_splati_w((v4i32)(in), (stidx));       \
1207     out1 = (RTYPE)__msa_splati_w((v4i32)(in), ((stidx) + 1)); \
1208   }
1209 #define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
1210 
1211 /* Description : Pack even byte elements of vector pairs
1212    Arguments   : Inputs  - in0, in1, in2, in3
1213                  Outputs - out0, out1
1214                  Return Type - as per RTYPE
1215    Details     : Even byte elements of 'in0' are copied to the left half of
1216                  'out0' & even byte elements of 'in1' are copied to the right
1217                  half of 'out0'.
1218 */
1219 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
1220   {                                                          \
1221     out0 = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1)); \
1222     out1 = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3)); \
1223   }
1224 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
1225 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
1226 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
1227 #define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
1228 
1229 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
1230                  out2, out3)                                                \
1231   {                                                                         \
1232     PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
1233     PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
1234   }
1235 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
1236 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
1237 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1238 
1239 /* Description : Pack even halfword elements of vector pairs
1240    Arguments   : Inputs  - in0, in1, in2, in3
1241                  Outputs - out0, out1
1242                  Return Type - as per RTYPE
1243    Details     : Even halfword elements of 'in0' are copied to the left half of
1244                  'out0' & even halfword elements of 'in1' are copied to the
1245                  right half of 'out0'.
1246 */
1247 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
1248   {                                                          \
1249     out0 = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1)); \
1250     out1 = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3)); \
1251   }
1252 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
1253 
1254 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
1255                  out2, out3)                                                \
1256   {                                                                         \
1257     PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
1258     PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
1259   }
1260 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1261 
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' are copied to the left
                 half of 'out0' & even double word elements of 'in1' are
                 copied to the right half of 'out0'. 'out1' is formed the same
                 way from 'in2' and 'in3'.
                 Inputs are parenthesized before being cast, so expression
                 arguments are converted as a whole.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckev_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_pckev_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
1277 
/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Odd double word elements of 'in0' are copied to the left half
                 of 'out0' & odd double word elements of 'in1' are copied to
                 the right half of 'out0'. 'out1' is formed the same way from
                 'in2' and 'in3'.
                 Inputs are parenthesized before being cast, so expression
                 arguments are converted as a whole.
*/
#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_pckod_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_pckod_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
1293 
/* Description : XOR every byte element with the immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : 'in0' and 'in1' are each reinterpreted as unsigned byte
                 vectors, xor'ed with 128 and written back in-place as RTYPE.
                 Both arguments must therefore be assignable.
*/
#define XORI_B2_128(RTYPE, in0, in1)              \
  {                                               \
    in0 = (RTYPE)__msa_xori_b((v16u8)(in0), 128); \
    in1 = (RTYPE)__msa_xori_b((v16u8)(in1), 128); \
  }
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
1308 
/* Three-vector form: every byte of 'in0', 'in1' and 'in2' is xor'ed with 128
   and the result is written back in-place as RTYPE. */
#define XORI_B3_128(RTYPE, in0, in1, in2)         \
  {                                               \
    in0 = (RTYPE)__msa_xori_b((v16u8)(in0), 128); \
    in1 = (RTYPE)__msa_xori_b((v16u8)(in1), 128); \
    in2 = (RTYPE)__msa_xori_b((v16u8)(in2), 128); \
  }
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
1315 
/* Four-vector form: every byte of 'in0' .. 'in3' is xor'ed with 128 and the
   result is written back in-place as RTYPE. */
#define XORI_B4_128(RTYPE, in0, in1, in2, in3)    \
  {                                               \
    in0 = (RTYPE)__msa_xori_b((v16u8)(in0), 128); \
    in1 = (RTYPE)__msa_xori_b((v16u8)(in1), 128); \
    in2 = (RTYPE)__msa_xori_b((v16u8)(in2), 128); \
    in3 = (RTYPE)__msa_xori_b((v16u8)(in3), 128); \
  }
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
1323 
/* Five-vector form: every byte of 'in0' .. 'in4' is xor'ed with 128 and the
   result is written back in-place as RTYPE (vectors are processed in
   argument order). */
#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \
  {                                                 \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);         \
    in4 = (RTYPE)__msa_xori_b((v16u8)(in4), 128);   \
  }
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
1330 
/* Eight-vector form: every byte of 'in0' .. 'in7' is xor'ed with 128 and the
   result is written back in-place as RTYPE (vectors are processed in
   argument order, two at a time). */
#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \
  {                                                                \
    XORI_B2_128(RTYPE, in0, in1);                                  \
    XORI_B2_128(RTYPE, in2, in3);                                  \
    XORI_B2_128(RTYPE, in4, in5);                                  \
    XORI_B2_128(RTYPE, in6, in7);                                  \
  }
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
1337 
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each of 'in0' .. 'in3' is left shifted by 'shift' with the C
                 '<<' operator and the result is written in-place.
                 'shift' is parenthesized so that an expression argument with
                 lower precedence than '<<' (e.g. 'a | b') is applied as a
                 whole shift count.
*/
#define SLLI_4V(in0, in1, in2, in3, shift) \
  {                                        \
    in0 = (in0) << (shift);                \
    in1 = (in1) << (shift);                \
    in2 = (in2) << (shift);                \
    in3 = (in3) << (shift);                \
  }
1352 
/* Description : Shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each of 'in0' .. 'in3' is right shifted by 'shift' with the C
                 '>>' operator and the result is written in-place. 'shift' is
                 a GP variable.
                 'shift' is parenthesized so that an expression argument with
                 lower precedence than '>>' (e.g. 'a | b') is applied as a
                 whole shift count.
*/
#define SRA_4V(in0, in1, in2, in3, shift) \
  {                                       \
    in0 = (in0) >> (shift);               \
    in1 = (in1) >> (shift);               \
    in2 = (in2) >> (shift);               \
    in3 = (in3) >> (shift);               \
  }
1368 
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically by
                 the number of bits in the corresponding element in the vector
                 'shift'. The last discarded bit is added to shifted value for
                 rounding and the result is written in-place. 'in1' is
                 processed the same way.
                 'shift' is a vector; it is parenthesized before the cast so
                 that an expression argument is converted as a whole.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
  {                                                          \
    in0 = (RTYPE)__msa_srar_w((v4i32)(in0), (v4i32)(shift)); \
    in1 = (RTYPE)__msa_srar_w((v4i32)(in1), (v4i32)(shift)); \
  }
1384 
/* Four-vector form of SRAR_W2: 'in0' .. 'in3' are each shifted right
   (arithmetic, rounded) by the per-element counts in the vector 'shift',
   in-place. 'shift' is forwarded in parentheses so that an expression
   argument is cast as a whole. */
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                               \
    SRAR_W2(RTYPE, in0, in1, (shift));            \
    SRAR_W2(RTYPE, in2, in3, (shift));            \
  }
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
1391 
/* Description : Rounded arithmetic right shift by an immediate
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : The halfword elements of 'in0' and 'in1' are shifted right
                 arithmetically by 'shift' bits; the last bit shifted out is
                 added back for rounding and each result replaces its input.
                 'shift' is an immediate value.
*/
#define SRARI_H2(RTYPE, in0, in1, shift)             \
  {                                                  \
    in0 = (RTYPE)__msa_srari_h((v8i16)(in0), shift); \
    in1 = (RTYPE)__msa_srari_h((v8i16)(in1), shift); \
  }
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
1408 
/* Four-vector form: the halfword elements of 'in0' .. 'in3' are shifted right
   arithmetically (rounded) by the immediate 'shift', in-place. */
#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)   \
  {                                                  \
    in0 = (RTYPE)__msa_srari_h((v8i16)(in0), shift); \
    in1 = (RTYPE)__msa_srari_h((v8i16)(in1), shift); \
    in2 = (RTYPE)__msa_srari_h((v8i16)(in2), shift); \
    in3 = (RTYPE)__msa_srari_h((v8i16)(in3), shift); \
  }
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
1416 
/* Word variant: the word elements of 'in0' and 'in1' are shifted right
   arithmetically (rounded) by the immediate 'shift', in-place. */
#define SRARI_W2(RTYPE, in0, in1, shift)             \
  {                                                  \
    in0 = (RTYPE)__msa_srari_w((v4i32)(in0), shift); \
    in1 = (RTYPE)__msa_srari_w((v4i32)(in1), shift); \
  }
1422 
/* Four-vector word variant: the word elements of 'in0' .. 'in3' are shifted
   right arithmetically (rounded) by the immediate 'shift', in-place. */
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)   \
  {                                                  \
    in0 = (RTYPE)__msa_srari_w((v4i32)(in0), shift); \
    in1 = (RTYPE)__msa_srari_w((v4i32)(in1), shift); \
    in2 = (RTYPE)__msa_srari_w((v4i32)(in2), shift); \
    in3 = (RTYPE)__msa_srari_w((v4i32)(in3), shift); \
  }
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1429 
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and the result is written to 'out0'; 'out1' receives
                 'in2' * 'in3'.
                 Operands are parenthesized so that expression arguments such
                 as 'a + b' are multiplied as a whole.
*/
#define MUL2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = (in0) * (in1);                    \
    out1 = (in2) * (in3);                    \
  }
/* Four-pair multiplication: out0 = in0 * in1, out1 = in2 * in3,
   out2 = in4 * in5, out3 = in6 * in7. Operands are parenthesized so that
   expression arguments are multiplied as a whole. */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    out0 = (in0) * (in1);                                                    \
    out1 = (in2) * (in3);                                                    \
    out2 = (in4) * (in5);                                                    \
    out3 = (in6) * (in7);                                                    \
  }
1446 
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in0' is added to 'in1' and result is written
                 to 'out0'; 'out1' receives 'in2' + 'in3'.
                 Operands are parenthesized so that expression arguments with
                 lower precedence than '+' (e.g. 'a << b', 'a & b') are added
                 as a whole.
*/
#define ADD2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = (in0) + (in1);                    \
    out1 = (in2) + (in3);                    \
  }
/* Four-pair addition: out0 = in0 + in1, out1 = in2 + in3, out2 = in4 + in5,
   out3 = in6 + in7. Operands are parenthesized so that expression arguments
   are added as a whole. */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    out0 = (in0) + (in1);                                                    \
    out1 = (in2) + (in3);                                                    \
    out2 = (in4) + (in5);                                                    \
    out3 = (in6) + (in7);                                                    \
  }
1463 
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in1' is subtracted from 'in0' and result is
                 written to 'out0'; 'out1' receives 'in2' - 'in3'.
                 Operands are parenthesized so that an expression subtrahend
                 such as 'a + b' is subtracted as a whole.
*/
#define SUB2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    out0 = (in0) - (in1);                    \
    out1 = (in2) - (in3);                    \
  }
/* Four-pair subtraction: out0 = in0 - in1, out1 = in2 - in3,
   out2 = in4 - in5, out3 = in6 - in7. Operands are parenthesized so that
   expression arguments are subtracted as a whole. */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    out0 = (in0) - (in1);                                                    \
    out1 = (in2) - (in3);                                                    \
    out2 = (in4) - (in5);                                                    \
    out3 = (in6) - (in7);                                                    \
  }
1482 
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in    (halfword vector)
                 Output - out   (sign extended word vector)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted (compare 'in' < 0) and interleaved right with the
                 same vector 'in' to generate 4 word elements keeping sign
                 intact.
                 'in' is expanded twice, so it should not have side effects;
                 it is parenthesized before each cast.
*/
#define UNPCK_R_SH_SW(in, out)                      \
  {                                                 \
    v8i16 sign_m;                                   \
                                                    \
    sign_m = __msa_clti_s_h((v8i16)(in), 0);        \
    out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in)); \
  }
1498 
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in          (unsigned byte vector)
                 Outputs - out0, out1  (halfword vectors)
                 Return Type - signed halfword
   Details     : 'in' is interleaved with an all-zero byte vector through
                 ILVRL_B2_SH.
                 Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
                 'in' is forwarded in parentheses so that an expression
                 argument stays intact inside ILVRL_B2_SH.
*/
#define UNPCK_UB_SH(in, out0, out1)        \
  {                                        \
    v16i8 zero_m = { 0 };                  \
                                           \
    ILVRL_B2_SH(zero_m, (in), out0, out1); \
  }
1512 
/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input   - in            (halfword vector)
                 Outputs - out0, out1   (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted (compare 'in' < 0) and interleaved right with the
                 same vector 'in' to generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 4 signed word elements in 'out1'
                 'in' is expanded more than once, so it should not have side
                 effects; it is parenthesized at every use.
*/
#define UNPCK_SH_SW(in, out0, out1)         \
  {                                         \
    v8i16 tmp_m;                            \
                                            \
    tmp_m = __msa_clti_s_h((v8i16)(in), 0); \
    ILVRL_H2_SW(tmp_m, (in), out0, out1);   \
  }
1531 
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation:
                   out0 = in0 + in3, out1 = in1 + in2,
                   out2 = in1 - in2, out3 = in0 - in3
                 The sums are stored before the differences are computed, so
                 'out0'/'out1' must not be the same objects as the inputs.
                 Operands are parenthesized so that expression arguments are
                 combined as a whole.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                             \
    out0 = (in0) + (in3);                                       \
    out1 = (in1) + (in2);                                       \
                                                                \
    out2 = (in1) - (in2);                                       \
    out3 = (in0) - (in3);                                       \
  }
1545 
/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Rows are byte-interleaved pairwise (in0/in2, in1/in3,
                 in4/in6, in5/in7), the results are interleaved again at byte
                 and then word granularity into out0/out2/out4/out6, and the
                 odd outputs are derived from the even ones via SLDI_B2_0
                 with a slide of 8.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0,  \
                        out1, out2, out3, out4, out5, out6, out7)             \
  {                                                                           \
    v16i8 ilv02_m, ilv13_m, ilv46_m, ilv57_m;                                 \
    v16i8 rows03_r_m, rows03_l_m, rows47_r_m, rows47_l_m;                     \
                                                                              \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, ilv02_m, ilv13_m,      \
               ilv46_m, ilv57_m);                                             \
    ILVRL_B2_SB(ilv13_m, ilv02_m, rows03_r_m, rows03_l_m);                    \
    ILVRL_B2_SB(ilv57_m, ilv46_m, rows47_r_m, rows47_l_m);                    \
    ILVRL_W2(RTYPE, rows47_r_m, rows03_r_m, out0, out2);                      \
    ILVRL_W2(RTYPE, rows47_l_m, rows03_l_m, out4, out6);                      \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                              \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                              \
  }
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
1567 
/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned byte
   Details     : The even words of rows n, n+4, n+8 and n+12 are gathered into
                 one vector per n (0..3); those four vectors are then
                 interleaved at byte and halfword granularity.
                 'out1' and 'out3' hold intermediate values while later inputs
                 are still being read, so the outputs must be distinct from
                 the inputs.
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                            in10, in11, in12, in13, in14, in15, out0, out1,   \
                            out2, out3)                                       \
  {                                                                           \
    v2i64 pair_a_m, pair_b_m, grp2_m, grp3_m;                                 \
                                                                              \
    /* rows 0, 4, 8, 12 -> out1 (temporary use) */                            \
    ILVEV_W2_SD(in0, in4, in8, in12, pair_a_m, pair_b_m);                     \
    out1 = (v16u8)__msa_ilvev_d(pair_b_m, pair_a_m);                          \
                                                                              \
    /* rows 1, 5, 9, 13 -> out3 (temporary use) */                            \
    ILVEV_W2_SD(in1, in5, in9, in13, pair_a_m, pair_b_m);                     \
    out3 = (v16u8)__msa_ilvev_d(pair_b_m, pair_a_m);                          \
                                                                              \
    /* rows 2, 6, 10, 14 */                                                   \
    ILVEV_W2_SD(in2, in6, in10, in14, pair_a_m, pair_b_m);                    \
    grp2_m = __msa_ilvev_d(pair_b_m, pair_a_m);                               \
                                                                              \
    /* rows 3, 7, 11, 15 */                                                   \
    ILVEV_W2_SD(in3, in7, in11, in15, pair_a_m, pair_b_m);                    \
    grp3_m = __msa_ilvev_d(pair_b_m, pair_a_m);                               \
                                                                              \
    /* even bytes -> out0 / out2 */                                           \
    ILVEV_B2_SD(out1, out3, grp2_m, grp3_m, pair_a_m, pair_b_m);              \
    out0 = (v16u8)__msa_ilvev_h((v8i16)pair_b_m, (v8i16)pair_a_m);            \
    out2 = (v16u8)__msa_ilvod_h((v8i16)pair_b_m, (v8i16)pair_a_m);            \
                                                                              \
    /* odd bytes -> out1 / out3 */                                            \
    pair_a_m = (v2i64)__msa_ilvod_b((v16i8)out3, (v16i8)out1);                \
    pair_b_m = (v2i64)__msa_ilvod_b((v16i8)grp3_m, (v16i8)grp2_m);            \
    out1 = (v16u8)__msa_ilvev_h((v8i16)pair_b_m, (v8i16)pair_a_m);            \
    out3 = (v16u8)__msa_ilvod_h((v8i16)pair_b_m, (v8i16)pair_a_m);            \
  }
1601 
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : Rows n and n+8 are first combined through ILVEV_D2_UB, then
                 the combined vectors are interleaved at byte, halfword and
                 word granularity.
                 The outputs are used as scratch storage while inputs are
                 still being read, so callers should pass outputs that are
                 distinct from the inputs.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                            in10, in11, in12, in13, in14, in15, out0, out1,   \
                            out2, out3, out4, out5, out6, out7)               \
  {                                                                           \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
                                                                              \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                              \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                            \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                            \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                            \
                                                                              \
    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                  \
    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                  \
    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                  \
    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                  \
    out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                    \
    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                  \
    out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                    \
    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                  \
                                                                              \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                  \
    out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                  \
    out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);              \
    out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
                                                                              \
    /* Each odd-halfword interleave is computed once; the earlier version   \
       repeated both assignments verbatim. */                                \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
    out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
    out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                \
  }
1649 
/* Description : Transpose 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : in0/in1 and in2/in3 are halfword-interleaved (right halves),
                 the two results are word-interleaved into 'out0' and 'out2',
                 and 'out1'/'out3' are derived from those with
                 __msa_ilvl_d.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 ilv01_m, ilv23_m;                                            \
                                                                       \
    ILVR_H2_SH(in1, in0, in3, in2, ilv01_m, ilv23_m);                  \
    ILVRL_W2_SH(ilv23_m, ilv01_m, out0, out2);                         \
    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);              \
    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);              \
  }
1664 
/* Description : Transpose 8x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : in0/in1 and in2/in3 are halfword-interleaved (right and left
                 halves), then the results are word-interleaved: right halves
                 go to 'out0'/'out2', left halves to 'out1'/'out3'.
                 Inputs are forwarded in parentheses so that expression
                 arguments stay intact inside the helper macros.
*/
#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
                                                                       \
    ILVR_H2_SH((in1), (in0), (in3), (in2), tmp0_m, tmp1_m);            \
    ILVL_H2_SH((in1), (in0), (in3), (in2), tmp2_m, tmp3_m);            \
    ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);            \
    ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);            \
  }
1679 
/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : in0/in1 and in2/in3 are word-interleaved (right and left
                 halves); the four outputs are then assembled by
                 double-word interleaving the matching halves.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v4i32 ilv01_r_m, ilv01_l_m;                                        \
    v4i32 ilv23_r_m, ilv23_l_m;                                        \
                                                                       \
    ILVRL_W2_SW(in1, in0, ilv01_r_m, ilv01_l_m);                       \
    ILVRL_W2_SW(in3, in2, ilv23_r_m, ilv23_l_m);                       \
                                                                       \
    out0 = (v4i32)__msa_ilvr_d((v2i64)ilv23_r_m, (v2i64)ilv01_r_m);    \
    out1 = (v4i32)__msa_ilvl_d((v2i64)ilv23_r_m, (v2i64)ilv01_r_m);    \
    out2 = (v4i32)__msa_ilvr_d((v2i64)ilv23_l_m, (v2i64)ilv01_l_m);    \
    out3 = (v4i32)__msa_ilvl_d((v2i64)ilv23_l_m, (v2i64)ilv01_l_m);    \
  }
1697 
/* Description : Dot product and addition of 3 input vector pairs
   Arguments   : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
                 Output - out0_m
                 Return Type - signed halfword
   Details     : All inputs are reinterpreted as signed byte vectors.
                 Dot product of 'in0' with 'coeff0'
                 Dot product of 'in1' with 'coeff1' is accumulated onto it
                 Dot product of 'in2' with 'coeff2'
                 The third product is combined with the first two through
                 __msa_adds_s_h (signed saturating halfword add):
                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
                 Inputs are parenthesized before being cast, so expression
                 arguments are converted as a whole.
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)          \
  ({                                                                 \
    v8i16 tmp1_m;                                                    \
    v8i16 out0_m;                                                    \
                                                                     \
    out0_m = __msa_dotp_s_h((v16i8)(in0), (v16i8)(coeff0));          \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8)(in1), (v16i8)(coeff1)); \
    tmp1_m = __msa_dotp_s_h((v16i8)(in2), (v16i8)(coeff2));          \
    out0_m = __msa_adds_s_h(out0_m, tmp1_m);                         \
                                                                     \
    out0_m;                                                          \
  })
1720 
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector (via __msa_pckev_b with 'in1' as the
                 first operand and 'in0' as the second) and the resulting
                 vector is xor'ed with 128 to shift the range from signed to
                 unsigned byte.
                 Inputs are parenthesized before being cast, so expression
                 arguments are converted as a whole.
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
  ({                                                          \
    v16u8 out_m;                                              \
    out_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
    out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);           \
    out_m;                                                    \
  })
1736 
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst
   Details     : The even bytes of 'in1' and 'in0' are packed with
                 __msa_pckev_b ('in1' is the first operand) and the resulting
                 v16i8 vector is written to 'pdst' through ST_SB.
                 Inputs are parenthesized before being cast, so expression
                 arguments are converted as a whole.
*/
#define PCKEV_ST_SB(in0, in1, pdst)                    \
  {                                                    \
    v16i8 tmp_m;                                       \
    tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
    ST_SB(tmp_m, (pdst));                              \
  }
1747 
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
                 Return Type - unsigned halfword
   Details     : Bytes of 'in0' and 'in1' are shuffled according to 'mask'
                 (__msa_vshf_b), the shuffled bytes are combined with 'coeff'
                 by an unsigned byte dot product into halfwords, and the
                 result is shifted right with rounding by the immediate
                 'shift'.
                 Vector inputs are parenthesized before being cast, so
                 expression arguments are converted as a whole.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)              \
  ({                                                                  \
    v16i8 tmp0_m;                                                     \
    v8u16 tmp1_m;                                                     \
                                                                      \
    tmp0_m = __msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0)); \
    tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)(coeff));           \
    tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);              \
                                                                      \
    tmp1_m;                                                           \
  })
1762 #endif  // VPX_VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
1763