• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef VPX_VPX_DSP_MIPS_MACROS_MSA_H_
12 #define VPX_VPX_DSP_MIPS_MACROS_MSA_H_
13 
14 #include <msa.h>
15 
16 #include "./vpx_config.h"
17 #include "vpx/vpx_integer.h"
18 
/* Description : Load one vector from memory
   Arguments   : Inputs  - RTYPE (type of the value to load), psrc
                 Return Type - as per RTYPE
   Details     : 'psrc' is cast to (const RTYPE *) and dereferenced.
                 The whole expansion is parenthesized so the loaded value can
                 be followed by a postfix operator or used as an operand of a
                 larger expression without precedence surprises.
*/
#define LD_V(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)
25 
/* Description : Store one vector to memory
   Arguments   : Inputs - RTYPE (type of the value to store), in, pdst
   Details     : 'pdst' is cast to (RTYPE *) and 'in' is assigned through it.
                 The assignment is parenthesized so the macro behaves as a
                 single expression wherever it is expanded.
*/
#define ST_V(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
31 
32 #if (__mips_isa_rev >= 6)
/* Load a 16-bit value from 'psrc' (variant used when __mips_isa_rev >= 6).
   'psrc' is reinterpreted as a pointer to const uint16_t and read directly;
   the macro evaluates to the loaded uint16_t. */
#define LH(psrc)                                         \
  ({                                                     \
    const uint16_t *src_lh_m = (const uint16_t *)(psrc); \
    uint16_t loaded_lh_m = *src_lh_m;                    \
    loaded_lh_m;                                         \
  })
38 
/* Load a 32-bit value from 'psrc' (variant used when __mips_isa_rev >= 6).
   'psrc' is reinterpreted as a pointer to const uint32_t and read directly;
   the macro evaluates to the loaded uint32_t. */
#define LW(psrc)                                         \
  ({                                                     \
    const uint32_t *src_lw_m = (const uint32_t *)(psrc); \
    uint32_t loaded_lw_m = *src_lw_m;                    \
    loaded_lw_m;                                         \
  })
44 
45 #if (__mips == 64)
/* Load a 64-bit value from 'psrc' (variant used when __mips_isa_rev >= 6 and
   __mips == 64). 'psrc' is reinterpreted as a pointer to const uint64_t and
   read directly; the macro evaluates to the loaded uint64_t. */
#define LD(psrc)                                         \
  ({                                                     \
    const uint64_t *src_ld_m = (const uint64_t *)(psrc); \
    uint64_t loaded_ld_m = *src_ld_m;                    \
    loaded_ld_m;                                         \
  })
51 #else  // !(__mips == 64)
/* Load a 64-bit value from 'psrc' as two 32-bit LW() loads (variant used when
   __mips_isa_rev >= 6 and __mips != 64). The word at byte offset 0 becomes
   bits 0..31 of the result and the word at byte offset 4 becomes bits
   32..63. */
#define LD(psrc)                                             \
  ({                                                         \
    const uint8_t *src_ld_m = (const uint8_t *)(psrc);       \
    const uint32_t lo_ld_m = LW(src_ld_m);                   \
    const uint32_t hi_ld_m = LW(src_ld_m + 4);               \
    uint64_t val_ld_m = ((uint64_t)hi_ld_m << 32) | lo_ld_m; \
                                                             \
    val_ld_m;                                                \
  })
67 #endif  // (__mips == 64)
68 
/* Store helpers for the __mips_isa_rev >= 6 build. Each one writes 'val'
   through 'pdst' reinterpreted as a pointer to the 16-, 32- or 64-bit
   unsigned type. The expansion already ends in ';', so the macros are
   statements (not expressions) and a ';' at the call site is optional. */
#define SH(val, pdst) ((uint16_t *)(pdst))[0] = (val);
#define SW(val, pdst) ((uint32_t *)(pdst))[0] = (val);
#define SD(val, pdst) ((uint64_t *)(pdst))[0] = (val);
72 #else  // !(__mips_isa_rev >= 6)
/* Load a 16-bit value from 'psrc' (variant used when __mips_isa_rev < 6).
   The load is done by a single inline-asm 'ulh' instruction whose source is
   passed as a memory ("m") operand and whose destination is a general
   register ("=r"). The macro evaluates to the loaded uint16_t.
   NOTE(review): 'ulh' is presumably the assembler's unaligned
   load-halfword macro - confirm against the assembler documentation.
   NOTE(review): the memory operand is typed uint8_t (one byte) although a
   halfword is read - confirm this is sufficient for the compiler. */
#define LH(psrc)                                                 \
  ({                                                             \
    const uint8_t *psrc_lh_m = (const uint8_t *)(psrc);          \
    uint16_t val_lh_m;                                           \
                                                                 \
    __asm__ __volatile__("ulh  %[val_lh_m],  %[psrc_lh_m]  \n\t" \
                                                                 \
                         : [val_lh_m] "=r"(val_lh_m)             \
                         : [psrc_lh_m] "m"(*psrc_lh_m));         \
                                                                 \
    val_lh_m;                                                    \
  })
85 
/* Load a 32-bit value from 'psrc' (variant used when __mips_isa_rev < 6).
   Two inline-asm instructions fill the same output register: 'lwr' at byte
   offset 0 and 'lwl' at byte offset 3 from 'psrc'. The output uses the
   early-clobber constraint ("=&r") so it is never allocated to the register
   holding the address. The macro evaluates to the loaded uint32_t.
   NOTE(review): the lwr@0 / lwl@3 pairing looks like the little-endian
   unaligned-load idiom - confirm for big-endian targets.
   NOTE(review): the asm has no memory input or "memory" clobber; it relies
   on __volatile__ - confirm this is enough to order it after prior stores. */
#define LW(psrc)                                        \
  ({                                                    \
    const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \
    uint32_t val_lw_m;                                  \
                                                        \
    __asm__ __volatile__(                               \
        "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t"         \
        "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t"         \
        : [val_lw_m] "=&r"(val_lw_m)                    \
        : [psrc_lw_m] "r"(psrc_lw_m));                  \
                                                        \
    val_lw_m;                                           \
  })
99 
100 #if (__mips == 64)
/* Load a 64-bit value from 'psrc' (variant used when __mips_isa_rev < 6 and
   __mips == 64). Two inline-asm instructions fill the same output register:
   'ldr' at byte offset 0 and 'ldl' at byte offset 7 from 'psrc'. The output
   is zero-initialized and uses the early-clobber constraint ("=&r"). The
   macro evaluates to the loaded uint64_t.
   NOTE(review): the ldr@0 / ldl@7 pairing looks like the little-endian
   unaligned-load idiom - confirm for big-endian targets. */
#define LD(psrc)                                        \
  ({                                                    \
    const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
    uint64_t val_ld_m = 0;                              \
                                                        \
    __asm__ __volatile__(                               \
        "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t"         \
        "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t"         \
        : [val_ld_m] "=&r"(val_ld_m)                    \
        : [psrc_ld_m] "r"(psrc_ld_m));                  \
                                                        \
    val_ld_m;                                           \
  })
114 #else  // !(__mips == 64)
/* Load a 64-bit value from 'psrc' as two 32-bit LW() loads (variant used when
   __mips_isa_rev < 6 and __mips != 64). The word at byte offset 0 becomes
   bits 0..31 of the result and the word at byte offset 4 becomes bits
   32..63. */
#define LD(psrc)                                             \
  ({                                                         \
    const uint8_t *src_ld_m = (const uint8_t *)(psrc);       \
    const uint32_t lo_ld_m = LW(src_ld_m);                   \
    const uint32_t hi_ld_m = LW(src_ld_m + 4);               \
    uint64_t val_ld_m = ((uint64_t)hi_ld_m << 32) | lo_ld_m; \
                                                             \
    val_ld_m;                                                \
  })
130 #endif  // (__mips == 64)
131 
/* Store the 16-bit value 'val' to 'pdst' (variant used when
   __mips_isa_rev < 6). 'val' is first converted to uint16_t, then written by
   a single inline-asm 'ush' instruction whose destination is a memory ("=m")
   operand. The macro expands to a brace-enclosed block, not an expression.
   NOTE(review): 'ush' is presumably the assembler's unaligned
   store-halfword macro - confirm against the assembler documentation.
   NOTE(review): the memory operand is typed uint8_t (one byte) although a
   halfword is written - confirm this is sufficient for the compiler. */
#define SH(val, pdst)                                            \
  {                                                              \
    uint8_t *pdst_sh_m = (uint8_t *)(pdst);                      \
    const uint16_t val_sh_m = (val);                             \
                                                                 \
    __asm__ __volatile__("ush  %[val_sh_m],  %[pdst_sh_m]  \n\t" \
                                                                 \
                         : [pdst_sh_m] "=m"(*pdst_sh_m)          \
                         : [val_sh_m] "r"(val_sh_m));            \
  }
142 
/* Store the 32-bit value 'val' to 'pdst' (variant used when
   __mips_isa_rev < 6). 'val' is first converted to uint32_t, then written by
   a single inline-asm 'usw' instruction whose destination is a memory ("=m")
   operand. The macro expands to a brace-enclosed block, not an expression.
   NOTE(review): 'usw' is presumably the assembler's unaligned store-word
   macro - confirm against the assembler documentation.
   NOTE(review): the memory operand is typed uint8_t (one byte) although a
   word is written - confirm this is sufficient for the compiler. */
#define SW(val, pdst)                                            \
  {                                                              \
    uint8_t *pdst_sw_m = (uint8_t *)(pdst);                      \
    const uint32_t val_sw_m = (val);                             \
                                                                 \
    __asm__ __volatile__("usw  %[val_sw_m],  %[pdst_sw_m]  \n\t" \
                                                                 \
                         : [pdst_sw_m] "=m"(*pdst_sw_m)          \
                         : [val_sw_m] "r"(val_sw_m));            \
  }
153 
/* Store the 64-bit value 'val' to 'pdst' as two 32-bit SW() stores (variant
   used when __mips_isa_rev < 6). Bits 0..31 go to byte offset 0 and bits
   32..63 go to byte offset 4.
   'val' is evaluated exactly once and converted to uint64_t before it is
   split, so arguments with side effects and arguments narrower than 64 bits
   are handled safely. The macro expands to a brace-enclosed block. */
#define SD(val, pdst)                                              \
  {                                                                \
    uint8_t *pdst_sd_m = (uint8_t *)(pdst);                        \
    const uint64_t val_sd_m = (uint64_t)(val);                     \
    const uint32_t val0_sd_m = (uint32_t)(val_sd_m & 0xFFFFFFFFu); \
    const uint32_t val1_sd_m = (uint32_t)(val_sd_m >> 32);         \
                                                                   \
    SW(val0_sd_m, pdst_sd_m);                                      \
    SW(val1_sd_m, pdst_sd_m + 4);                                  \
  }
165 #endif  // (__mips_isa_rev >= 6)
166 
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : Load word in 'out0' from (psrc)
                 Load word in 'out1' from (psrc + stride)
                 Load word in 'out2' from (psrc + 2 * stride)
                 Load word in 'out3' from (psrc + 3 * stride)
                 'stride' is added with ordinary pointer arithmetic on
                 'psrc' and is parenthesized, so any expression may be
                 passed for it.
*/
#define LW4(psrc, stride, out0, out1, out2, out3) \
  {                                               \
    out0 = LW((psrc));                            \
    out1 = LW((psrc) + (stride));                 \
    out2 = LW((psrc) + 2 * (stride));             \
    out3 = LW((psrc) + 3 * (stride));             \
  }
182 
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD2); out0, out1, out2, out3 (LD4)
   Details     : Load double word in 'out0' from (psrc)
                 Load double word in 'out1' from (psrc + stride)
                 LD4 additionally loads 'out2' from (psrc + 2 * stride) and
                 'out3' from (psrc + 3 * stride)
                 'stride' is added with ordinary pointer arithmetic on
                 'psrc' and is parenthesized, so any expression may be
                 passed for it.
*/
#define LD2(psrc, stride, out0, out1) \
  {                                   \
    out0 = LD((psrc));                \
    out1 = LD((psrc) + (stride));     \
  }
#define LD4(psrc, stride, out0, out1, out2, out3)     \
  {                                                   \
    LD2((psrc), (stride), out0, out1);                \
    LD2((psrc) + 2 * (stride), (stride), out2, out3); \
  }
199 
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store word from 'in0' to (pdst)
                 Store word from 'in1' to (pdst + stride)
                 Store word from 'in2' to (pdst + 2 * stride)
                 Store word from 'in3' to (pdst + 3 * stride)
                 'stride' is added with ordinary pointer arithmetic on
                 'pdst' and is parenthesized, so any expression may be
                 passed for it. Every SW() is explicitly terminated, so the
                 macro does not depend on how SW() itself is spelled.
*/
#define SW4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SW(in0, (pdst));                          \
    SW(in1, (pdst) + (stride));               \
    SW(in2, (pdst) + 2 * (stride));           \
    SW(in3, (pdst) + 3 * (stride));           \
  }
214 
/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store double word from 'in0' to (pdst)
                 Store double word from 'in1' to (pdst + stride)
                 Store double word from 'in2' to (pdst + 2 * stride)
                 Store double word from 'in3' to (pdst + 3 * stride)
                 'stride' is added with ordinary pointer arithmetic on
                 'pdst' and is parenthesized, so any expression may be
                 passed for it. Every SD() is explicitly terminated, so the
                 macro does not depend on how SD() itself is spelled.
*/
#define SD4(in0, in1, in2, in3, pdst, stride) \
  {                                           \
    SD(in0, (pdst));                          \
    SD(in1, (pdst) + (stride));               \
    SD(in2, (pdst) + 2 * (stride));           \
    SD(in3, (pdst) + 3 * (stride));           \
  }
229 
/* Description : Load vector elements with stride
   Arguments   : Inputs  - RTYPE, psrc, stride
                 Outputs - out0, out1, ... (2, 3, 4, 5, 7, 8 or 16 vectors,
                           depending on the macro)
                 Return Type - as per RTYPE
   Details     : Load a vector in 'out0' from (psrc)
                 Load a vector in 'out1' from (psrc + stride)
                 ...
                 Load a vector in 'outN' from (psrc + N * stride)
                 'stride' is added with ordinary pointer arithmetic on
                 'psrc' and is parenthesized, so any expression may be
                 passed for it. The larger variants are built from the
                 smaller ones.
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1) \
  {                                            \
    out0 = LD_V(RTYPE, (psrc));                \
    out1 = LD_V(RTYPE, (psrc) + (stride));     \
  }
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)

#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \
  {                                                  \
    LD_V2(RTYPE, (psrc), (stride), out0, out1);      \
    out2 = LD_V(RTYPE, (psrc) + 2 * (stride));       \
  }
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)

#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3)     \
  {                                                            \
    LD_V2(RTYPE, (psrc), (stride), out0, out1);                \
    LD_V2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
  }
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)

#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
  {                                                              \
    LD_V4(RTYPE, (psrc), (stride), out0, out1, out2, out3);      \
    out4 = LD_V(RTYPE, (psrc) + 4 * (stride));                   \
  }
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)

#define LD_V7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \
  {                                                                          \
    LD_V5(RTYPE, (psrc), (stride), out0, out1, out2, out3, out4);            \
    LD_V2(RTYPE, (psrc) + 5 * (stride), (stride), out5, out6);               \
  }
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)

#define LD_V8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
              out7)                                                          \
  {                                                                          \
    LD_V4(RTYPE, (psrc), (stride), out0, out1, out2, out3);                  \
    LD_V4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7);   \
  }
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)

#define LD_V16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \
               out7, out8, out9, out10, out11, out12, out13, out14, out15)    \
  {                                                                           \
    LD_V8(RTYPE, (psrc), (stride), out0, out1, out2, out3, out4, out5, out6,  \
          out7);                                                              \
    LD_V8(RTYPE, (psrc) + 8 * (stride), (stride), out8, out9, out10, out11,   \
          out12, out13, out14, out15);                                        \
  }
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
297 
/* Description : Load 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (Each vector with 4 signed halfwords)
   Arguments   : Input   - psrc
                 Outputs - out0, out1, out2, out3
   Details     : 'out0' is loaded from (psrc) and 'out2' from (psrc + 8)
                 with LD_SH. 'out1' and 'out3' are produced by
                 __msa_ilvl_d applied to 'out0' and 'out2' with themselves.
                 NOTE(review): presumably this duplicates the upper 64 bits
                 so that rows 1 and 3 land in the low half - confirm against
                 the MSA reference.
                 'psrc' is parenthesized so any pointer expression may be
                 passed for it.
*/
#define LD4x4_SH(psrc, out0, out1, out2, out3)                \
  {                                                           \
    out0 = LD_SH((psrc));                                     \
    out2 = LD_SH((psrc) + 8);                                 \
    out1 = (v8i16)__msa_ilvl_d((v2i64)(out0), (v2i64)(out0)); \
    out3 = (v8i16)__msa_ilvl_d((v2i64)(out2), (v2i64)(out2)); \
  }
310 
/* Description : Store vectors with stride
   Arguments   : Inputs - RTYPE, in0, in1, ... (2, 4 or 8 vectors, depending
                          on the macro), pdst, stride
   Details     : Store 'in0' to (pdst)
                 Store 'in1' to (pdst + stride)
                 ...
                 Store 'inN' to (pdst + N * stride)
                 'stride' is added with ordinary pointer arithmetic on
                 'pdst' and is parenthesized, so any expression may be
                 passed for it. The larger variants are built from the
                 smaller ones.
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride) \
  {                                          \
    ST_V(RTYPE, in0, (pdst));                \
    ST_V(RTYPE, in1, (pdst) + (stride));     \
  }
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)

#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride)       \
  {                                                          \
    ST_V2(RTYPE, in0, in1, (pdst), (stride));                \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
  }
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)

#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
  {                                                                        \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), (stride));                    \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));     \
  }
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
340 
/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index 'stidx' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst)
                 Index 'stidx+1' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + stride)
                 Index 'stidx+2' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 2 * stride)
                 Index 'stidx+3' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 3 * stride)
                 'pdst' is converted to (uint8_t *) first, so 'stride' is
                 counted in bytes. All arguments are parenthesized, so any
                 expression may be passed for them.
*/
#define ST2x4_UB(in, stidx, pdst, stride)              \
  {                                                    \
    uint16_t out0_m, out1_m, out2_m, out3_m;           \
    uint8_t *pblk_2x4_m = (uint8_t *)(pdst);           \
                                                       \
    out0_m = __msa_copy_u_h((v8i16)(in), (stidx));     \
    out1_m = __msa_copy_u_h((v8i16)(in), (stidx) + 1); \
    out2_m = __msa_copy_u_h((v8i16)(in), (stidx) + 2); \
    out3_m = __msa_copy_u_h((v8i16)(in), (stidx) + 3); \
                                                       \
    SH(out0_m, pblk_2x4_m);                            \
    SH(out1_m, pblk_2x4_m + (stride));                 \
    SH(out2_m, pblk_2x4_m + 2 * (stride));             \
    SH(out3_m, pblk_2x4_m + 3 * (stride));             \
  }
367 
/* Description : Store 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 word element from 'in' vector is copied to the GP
                 register and stored to (pdst)
                 Index 1 word element from 'in' vector is copied to the GP
                 register and stored to (pdst + stride)
                 'pdst' is converted to (uint8_t *) first, so 'stride' is
                 counted in bytes. All arguments are parenthesized, so any
                 expression may be passed for them.
*/
#define ST4x2_UB(in, pdst, stride)           \
  {                                          \
    uint32_t out0_m, out1_m;                 \
    uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
                                             \
    out0_m = __msa_copy_u_w((v4i32)(in), 0); \
    out1_m = __msa_copy_u_w((v4i32)(in), 1); \
                                             \
    SW(out0_m, pblk_4x2_m);                  \
    SW(out1_m, pblk_4x2_m + (stride));       \
  }
386 
/* Description : Store 4x4 byte block to destination memory from input vectors
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 'pdst' is converted to (uint8_t *) first, so 'stride' is
                 counted in bytes. All arguments are parenthesized, so any
                 expression may be passed for them.
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
  {                                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;                     \
    uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                     \
                                                                 \
    out0_m = __msa_copy_u_w((v4i32)(in0), (idx0));               \
    out1_m = __msa_copy_u_w((v4i32)(in0), (idx1));               \
    out2_m = __msa_copy_u_w((v4i32)(in1), (idx2));               \
    out3_m = __msa_copy_u_w((v4i32)(in1), (idx3));               \
                                                                 \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride));   \
  }
/* Description : Store 4x8 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Word elements 0..3 of 'in0' are stored to (pdst),
                 (pdst + stride), (pdst + 2 * stride), (pdst + 3 * stride)
                 Word elements 0..3 of 'in1' are stored to the following
                 four rows, starting at (pdst + 4 * stride)
                 'pdst' is converted to (uint8_t *) first, so 'stride' is
                 counted in bytes; it is parenthesized, so any expression
                 may be passed for it.
*/
#define ST4x8_UB(in0, in1, pdst, stride)                                 \
  {                                                                      \
    uint8_t *pblk_4x8 = (uint8_t *)(pdst);                               \
                                                                         \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, (stride));                  \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * (stride), (stride));   \
  }
417 
/* Description : Store 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 'in' and 'pdst' are parenthesized, so any expression may be
                 passed for them.
*/
#define ST8x1_UB(in, pdst)                   \
  {                                          \
    uint64_t out0_m;                         \
                                             \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    SD(out0_m, (pdst));                      \
  }
430 
/* Description : Store 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst + stride)
                 'pdst' is converted to (uint8_t *) first, so 'stride' is
                 counted in bytes. All arguments are parenthesized, so any
                 expression may be passed for them.
*/
#define ST8x2_UB(in, pdst, stride)           \
  {                                          \
    uint64_t out0_m, out1_m;                 \
    uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
                                             \
    out0_m = __msa_copy_u_d((v2i64)(in), 0); \
    out1_m = __msa_copy_u_d((v2i64)(in), 1); \
                                             \
    SD(out0_m, pblk_8x2_m);                  \
    SD(out1_m, pblk_8x2_m + (stride));       \
  }
449 
/* Description : Store 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst + stride)
                 Index 0 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 Index 1 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 'pdst' is converted to (uint8_t *) first, so 'stride' is
                 counted in bytes. All arguments are parenthesized, so any
                 expression may be passed for them.
*/
#define ST8x4_UB(in0, in1, pdst, stride)                       \
  {                                                            \
    uint64_t out0_m, out1_m, out2_m, out3_m;                   \
    uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                   \
                                                               \
    out0_m = __msa_copy_u_d((v2i64)(in0), 0);                  \
    out1_m = __msa_copy_u_d((v2i64)(in0), 1);                  \
    out2_m = __msa_copy_u_d((v2i64)(in1), 0);                  \
    out3_m = __msa_copy_u_d((v2i64)(in1), 1);                  \
                                                               \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, (stride)); \
  }
474 
/* Description : Average with rounding, (a + b + 1) / 2, of unsigned bytes
   Arguments   : Inputs  - in0, in1, in2, in3 (AVER_UB2)
                           in0 .. in7 (AVER_UB4)
                 Outputs - out0, out1 (AVER_UB2)
                           out0, out1, out2, out3 (AVER_UB4)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from 'in0' vector is added with
                 each unsigned byte element from 'in1' vector. Then the average
                 with rounding is calculated and written to 'out0'.
                 The following input pairs (in2/in3, in4/in5, in6/in7) are
                 processed the same way into 'out1', 'out2', 'out3'.
                 Inputs are parenthesized before being cast, so any vector
                 expression may be passed for them.
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
  {                                                           \
    out0 = (RTYPE)__msa_aver_u_b((v16u8)(in0), (v16u8)(in1)); \
    out1 = (RTYPE)__msa_aver_u_b((v16u8)(in2), (v16u8)(in3)); \
  }
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
497 
/* Description : Immediate number of elements to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val (SLDI_B2_0)
                           in0, in1, in2, in3, slide_val (SLDI_B4_0)
                 Outputs - out0, out1 (SLDI_B2_0)
                           out0, out1, out2, out3 (SLDI_B4_0)
                 Return Type - as per RTYPE
   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
                 value specified in the 'slide_val'. The same is done for
                 every further input into the matching output.
                 'zero_m' is a local all-zero v16i8 vector. Inputs are
                 parenthesized before being cast, so any vector expression
                 may be passed for them.
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
  {                                                                       \
    v16i8 zero_m = { 0 };                                                 \
    out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in0), (slide_val)); \
    out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in1), (slide_val)); \
  }
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, \
                  slide_val)                                         \
  {                                                                  \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);               \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);               \
  }
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
520 
/* Description : Immediate number of elements to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val (SLDI_B2)
                           in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,
                           slide_val (SLDI_B3)
                 Outputs - out0, out1 (SLDI_B2)
                           out0, out1, out2 (SLDI_B3)
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                 value specified in the 'slide_val'. The pairs in0_1/in1_1
                 and in0_2/in1_2 are processed the same way into 'out1' and
                 'out2'.
                 Inputs are parenthesized before being cast, so any vector
                 expression may be passed for them.
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)    \
  {                                                                          \
    out0 = (RTYPE)__msa_sldi_b((v16i8)(in0_0), (v16i8)(in1_0), (slide_val)); \
    out1 = (RTYPE)__msa_sldi_b((v16i8)(in0_1), (v16i8)(in1_1), (slide_val)); \
  }
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)

#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, out0, out1, \
                out2, slide_val)                                             \
  {                                                                          \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val);       \
    out2 = (RTYPE)__msa_sldi_b((v16i8)(in0_2), (v16i8)(in1_2), (slide_val)); \
  }
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
544 
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1 (VSHF_B2)
                           in0, in1, mask0, mask1, mask2, mask3 (VSHF_B4)
                 Outputs - out0, out1 (VSHF_B2)
                           out0, out1, out2, out3 (VSHF_B4)
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'; 'in2' & 'in3' go to
                 'out1' as per 'mask1'. The intrinsic is called as
                 __msa_vshf_b(mask, in1, in0), i.e. the second input of each
                 pair is passed before the first.
                 VSHF_B4 shuffles the same pair 'in0' & 'in1' with four
                 different masks.
                 Inputs are parenthesized before being cast, so any vector
                 expression may be passed for them.
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)        \
  {                                                                         \
    out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2)); \
  }
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \
                out3)                                                          \
  {                                                                            \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);              \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);              \
  }
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
570 
/* Description : Dot product of unsigned byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1 (DOTP_UB2)
                           mult0 .. mult3, cnst0 .. cnst3 (DOTP_UB4)
                 Outputs - out0, out1 (DOTP_UB2)
                           out0, out1, out2, out3 (DOTP_UB4)
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0' producing a result
                 twice the size of input i.e. unsigned halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 Every further multN/cnstN pair is processed the same way
                 into 'outN'.
                 Inputs are parenthesized before being cast, so any vector
                 expression may be passed for them.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_u_h((v16u8)(mult0), (v16u8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_u_h((v16u8)(mult1), (v16u8)(cnst1)); \
  }
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
595 
/* Description : Dot product of signed byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1 (DOTP_SB2)
                           mult0 .. mult3, cnst0 .. cnst3 (DOTP_SB4)
                 Outputs - out0, out1 (DOTP_SB2)
                           out0, out1, out2, out3 (DOTP_SB4)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 Every further multN/cnstN pair is processed the same way
                 into 'outN'.
                 Inputs are parenthesized before being cast, so any vector
                 expression may be passed for them.
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1)); \
  }
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
620 
/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication results of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' are combined the same way into 'out1'.
                 Inputs are parenthesized so the casts cover whole
                 argument expressions.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1)); \
  }
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

/* Four-output variant built from two DOTP_SH2 invocations. */
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                 cnst3, out0, out1, out2, out3)                          \
  {                                                                      \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
645 
/* Description : Dot product of word vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed word elements from 'mult0' are multiplied with
                 signed word elements from 'cnst0' producing a result
                 twice the size of input i.e. signed double word.
                 The multiplication results of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 'mult1' and 'cnst1' are combined the same way into 'out1'.
                 Inputs are parenthesized so the casts cover whole
                 argument expressions.
*/
#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
  {                                                               \
    out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0)); \
    out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1)); \
  }
#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
662 
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as the accumulators)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication results of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'mult1' and 'cnst1' are accumulated the same way into 'out1'.
                 Arguments are parenthesized so the casts cover whole
                 argument expressions.
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
  {                                                               \
    out0 = (RTYPE)__msa_dpadd_s_h((v8i16)(out0), (v16i8)(mult0),  \
                                  (v16i8)(cnst0));                \
    out1 = (RTYPE)__msa_dpadd_s_h((v8i16)(out1), (v16i8)(mult1),  \
                                  (v16i8)(cnst1));                \
  }
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

/* Four-accumulator variant built from two DPADD_SB2 invocations. */
#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, cnst0, cnst1, cnst2, \
                  cnst3, out0, out1, out2, out3)                          \
  {                                                                       \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);             \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);             \
  }
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
687 
/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1 (also read as the accumulators)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication results of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'mult1' and 'cnst1' are accumulated the same way into 'out1'.
                 Arguments are parenthesized so the casts cover whole
                 argument expressions.
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
  {                                                               \
    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)(out0), (v8i16)(mult0),  \
                                  (v8i16)(cnst0));                \
    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)(out1), (v8i16)(mult1),  \
                                  (v8i16)(cnst1));                \
  }
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
704 
/* Description : Dot product & addition of double word vector elements
   Arguments   : Inputs  - mult0, mult1
                 Outputs - out0, out1 (also read as the accumulators)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'mult0' is multiplied with
                 itself producing an intermediate result twice the size of
                 input i.e. signed double word.
                 The multiplication results of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'mult1' is squared and accumulated the same way into 'out1'.
                 Each 'multN' argument is expanded twice, so it should not
                 have side effects.
*/
#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1)                \
  {                                                               \
    out0 = (RTYPE)__msa_dpadd_s_d((v2i64)(out0), (v4i32)(mult0),  \
                                  (v4i32)(mult0));                \
    out1 = (RTYPE)__msa_dpadd_s_d((v2i64)(out1), (v4i32)(mult1),  \
                                  (v4i32)(mult1));                \
  }
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
721 
/* Description : Element-wise unsigned halfword minimum against a common
                 vector, performed in place
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each of 'in0' and 'in1' is replaced by the minimum of its
                 unsigned halfword elements and those of 'min_vec'.
                 MIN_UH4 does the same for 'in0' .. 'in3'.
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)              \
  {                                                    \
    in0 = (RTYPE)__msa_min_u_h((v8u16)(in0), min_vec); \
    in1 = (RTYPE)__msa_min_u_h((v8u16)(in1), min_vec); \
  }
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)    \
  {                                                    \
    in0 = (RTYPE)__msa_min_u_h((v8u16)(in0), min_vec); \
    in1 = (RTYPE)__msa_min_u_h((v8u16)(in1), min_vec); \
    in2 = (RTYPE)__msa_min_u_h((v8u16)(in2), min_vec); \
    in3 = (RTYPE)__msa_min_u_h((v8u16)(in3), min_vec); \
  }
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
743 
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Input  - in
                 Output - out_m
                 Return Type - signed halfword
   Details     : The input is first raised to at least 0 and then limited to
                 at most 255. 'in' is parenthesized so the cast covers the
                 whole argument expression.
                 CLIP_SH2_0_255 / CLIP_SH4_0_255 clip two / four vectors
                 in place.
*/
#define CLIP_SH_0_255(in)                              \
  ({                                                   \
    v8i16 max_m = __msa_ldi_h(255);                    \
    v8i16 out_m;                                       \
                                                       \
    out_m = __msa_maxi_s_h((v8i16)(in), 0);            \
    out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m); \
    out_m;                                             \
  })
#define CLIP_SH2_0_255(in0, in1) \
  {                              \
    in0 = CLIP_SH_0_255(in0);    \
    in1 = CLIP_SH_0_255(in1);    \
  }
#define CLIP_SH4_0_255(in0, in1, in2, in3) \
  {                                        \
    CLIP_SH2_0_255(in0, in1);              \
    CLIP_SH2_0_255(in2, in3);              \
  }
769 
/* Description : Horizontal addition of 4 signed word elements of input vector
   Arguments   : Input  - in       (signed word vector)
                 Output - sum_m    (i32 sum)
                 Return Type - signed word (GP)
   Details     : 4 signed word elements of 'in' vector are added together and
                 the resulting integer sum is returned.
                 'in' is expanded twice, so it should not have side effects;
                 it is parenthesized so the cast covers the whole expression.
*/
#define HADD_SW_S32(in)                                \
  ({                                                   \
    v2i64 res0_m, res1_m;                              \
    int32_t sum_m;                                     \
                                                       \
    res0_m = __msa_hadd_s_d((v4i32)(in), (v4i32)(in)); \
    res1_m = __msa_splati_d(res0_m, 1);                \
    res0_m = res0_m + res1_m;                          \
    sum_m = __msa_copy_s_w((v4i32)res0_m, 0);          \
    sum_m;                                             \
  })
788 
/* Description : Horizontal addition of 4 unsigned word elements
   Arguments   : Input  - in       (unsigned word vector)
                 Output - sum_m    (u32 sum)
                 Return Type - unsigned word (GP)
   Details     : 4 unsigned word elements of 'in' vector are added together and
                 the resulting integer sum is returned.
                 'in' is expanded twice, so it should not have side effects;
                 it is parenthesized so the cast covers the whole expression.
*/
#define HADD_UW_U32(in)                                \
  ({                                                   \
    v2u64 res0_m, res1_m;                              \
    uint32_t sum_m;                                    \
                                                       \
    res0_m = __msa_hadd_u_d((v4u32)(in), (v4u32)(in)); \
    res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);  \
    res0_m += res1_m;                                  \
    sum_m = __msa_copy_u_w((v4i32)res0_m, 0);          \
    sum_m;                                             \
  })
807 
/* Description : Horizontal addition of 8 unsigned halfword elements
   Arguments   : Input  - in       (unsigned halfword vector)
                 Output - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : 8 unsigned halfword elements of 'in' vector are added
                 together and the resulting integer sum is returned.
                 Halfword pairs are first widened to words, then the words
                 are reduced with HADD_UW_U32.
                 'in' is expanded twice, so it should not have side effects;
                 it is parenthesized so the cast covers the whole expression.
*/
#define HADD_UH_U32(in)                               \
  ({                                                  \
    v4u32 res_m;                                      \
    uint32_t sum_m;                                   \
                                                      \
    res_m = __msa_hadd_u_w((v8u16)(in), (v8u16)(in)); \
    sum_m = HADD_UW_U32(res_m);                       \
    sum_m;                                            \
  })
824 
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'.
                 'in1' is reduced the same way into 'out1'.
                 Each input is expanded twice, so it should not have side
                 effects; inputs are parenthesized so the casts cover whole
                 argument expressions.
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
  {                                                           \
    out0 = (RTYPE)__msa_hadd_u_h((v16u8)(in0), (v16u8)(in0)); \
    out1 = (RTYPE)__msa_hadd_u_h((v16u8)(in1), (v16u8)(in1)); \
  }
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

/* Four-input variant built from two HADD_UB2 invocations. */
#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                 \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                          \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                          \
  }
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
846 
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned even byte element from 'in0' is subtracted from
                 the adjacent unsigned odd byte element from 'in0' (pairwise,
                 i.e. odd - even) and the halfword result is written to
                 'out0'. 'in1' is processed the same way into 'out1'.
                 Each input is expanded twice, so it should not have side
                 effects; inputs are parenthesized so the casts cover whole
                 argument expressions.
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
  {                                                           \
    out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0)); \
    out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1)); \
  }
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
861 
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1
                 Outputs - sad_m                 (halfword vector)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0' with
                 'ref0' is calculated and preserved in 'diff0_m' (likewise
                 'in1' with 'ref1' in 'diff1_m'). Then even-odd pairs of each
                 difference vector are added together and both sets of
                 8 halfword results are accumulated into 'sad_m'.
                 Inputs are parenthesized so the casts cover whole argument
                 expressions.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                     \
  ({                                                         \
    v16u8 diff0_m, diff1_m;                                  \
    v8u16 sad_m = { 0 };                                     \
                                                             \
    diff0_m = __msa_asub_u_b((v16u8)(in0), (v16u8)(ref0));   \
    diff1_m = __msa_asub_u_b((v16u8)(in1), (v16u8)(ref1));   \
                                                             \
    sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m); \
    sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m); \
                                                             \
    sad_m;                                                   \
  })
883 
/* Description : Horizontal subtraction of signed halfword vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed even halfword element from 'in0' is subtracted
                 from the adjacent signed odd halfword element from 'in0'
                 (pairwise, i.e. odd - even) and the word result is written
                 to 'out0'. 'in1' is processed the same way into 'out1'.
                 Note that despite the 'UH' in the name, the inputs are
                 treated as signed halfwords.
                 Each input is expanded twice, so it should not have side
                 effects; inputs are parenthesized so the casts cover whole
                 argument expressions.
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1)                 \
  {                                                           \
    out0 = (RTYPE)__msa_hsub_s_w((v8i16)(in0), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_hsub_s_w((v8i16)(in1), (v8i16)(in1)); \
  }
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
898 
/* Description : Set consecutive elements of a vector from GPR values
   Arguments   : Inputs - in0, in1 (INSERT_W2, INSERT_D2)
                          in0, in1, in2, in3 (INSERT_W4)
                 Output - out (updated in place)
                 Return Type - as per RTYPE
   Details     : Element n of vector 'out' is set to the value given in 'inN'.
                 The INSERT_W* macros write word elements, INSERT_D2 writes
                 double word elements. Elements that are not written keep
                 their previous contents.
*/
#define INSERT_W2(RTYPE, in0, in1, out)                \
  {                                                    \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 0, in0); \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 1, in1); \
  }
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)      \
  {                                                    \
    INSERT_W2(RTYPE, in0, in1, out);                   \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 2, in2); \
    out = (RTYPE)__msa_insert_w((v4i32)(out), 3, in3); \
  }
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)

#define INSERT_D2(RTYPE, in0, in1, out)                \
  {                                                    \
    out = (RTYPE)__msa_insert_d((v2i64)(out), 0, in0); \
    out = (RTYPE)__msa_insert_d((v2i64)(out), 1, in1); \
  }
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
930 
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' are interleaved the
                 same way into 'out1'.
                 Inputs are parenthesized so the casts cover whole argument
                 expressions.
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0)); \
    out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2)); \
  }
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
945 
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' are interleaved the
                 same way into 'out1'.
                 Inputs are parenthesized so the casts cover whole argument
                 expressions.
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0)); \
    out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2)); \
  }
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
961 
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' are interleaved the
                 same way into 'out1'.
                 Inputs are parenthesized so the casts cover whole argument
                 expressions.
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0)); \
    out1 = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2)); \
  }
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
975 
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' are interleaved the
                 same way into 'out1'.
                 Inputs are parenthesized so the casts cover whole argument
                 expressions.
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                          \
    out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0)); \
    out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2)); \
  }
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
989 
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' are interleaved the
                 same way into 'out1'.
                 Inputs are parenthesized so the casts cover whole argument
                 expressions.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

/* Four-output variant built from two ILVL_B2 invocations. */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
1016 
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; 'in2' and 'in3' are
                 interleaved the same way into 'out1'.
                 Inputs are parenthesized so the casts cover whole argument
                 expressions.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
1031 
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' are interleaved the
                 same way into 'out1'.
                 Inputs are parenthesized so the casts cover whole argument
                 expressions.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)(in2), (v4i32)(in3)); \
  }
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1046 
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; 'in2' and 'in3' are interleaved the
                 same way into 'out1'.
                 Inputs are parenthesized so the casts cover whole argument
                 expressions.
                 ILVR_B4 / ILVR_B8 extend this to four / eight input pairs.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)

#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
                in11, in12, in13, in14, in15, out0, out1, out2, out3, out4,    \
                out5, out6, out7)                                              \
  {                                                                            \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2,   \
            out3);                                                             \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, out4, out5,   \
            out6, out7);                                                       \
  }
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
1085 
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; 'in2' and 'in3' are
                 interleaved the same way into 'out1'.
                 Inputs are parenthesized so the casts cover whole argument
                 expressions.
                 ILVR_H4 extends this to four input pairs.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1108 
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'in0' and 'in1' are combined with the word interleave-right
                 operation into 'out0'; 'in2' and 'in3' are combined the same
                 way into 'out1'.
                 Inputs are parenthesized so the casts cover whole argument
                 expressions.
                 ILVR_W4 extends this to four input pairs.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvr_w((v4i32)(in2), (v4i32)(in3)); \
  }
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1124 
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (ILVR_D2)
                           in0 .. in5         (ILVR_D3)
                           in0 .. in7         (ILVR_D4)
                 Outputs - out0, out1 [, out2 [, out3]]
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of each input pair
                 (in0/in1, in2/in3, ...) are interleaved and written to the
                 matching output (out0, out1, ...).
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1)); \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
  {                                                                    \
    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));            \
    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));            \
    out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));            \
  }
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                        \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);                        \
  }
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1156 
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'; left half of elements
                 from 'in0' and 'in1' are interleaved and written to 'out1'.
                 ILVRL_B2 works on bytes, ILVRL_H2 on halfwords and
                 ILVRL_W2 on words.
                 'out0' is written before the inputs are read again for
                 'out1', so 'out0' must not be the same variable as 'in0'
                 or 'in1'.
                 Inputs are parenthesized so the casts cover whole argument
                 expressions.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1)); \
    out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1)); \
  }
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1)); \
    out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1)); \
  }
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
  {                                                         \
    out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1)); \
    out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1)); \
  }
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SB(...) ILVRL_W2(v16i8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1191 
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 The results are written in place. 'in1' is handled the same
                 way; SAT_UH4 additionally saturates 'in2' and 'in3'.
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)              \
  {                                                    \
    in0 = (RTYPE)__msa_sat_u_h((v8u16)(in0), sat_val); \
    in1 = (RTYPE)__msa_sat_u_h((v8u16)(in1), sat_val); \
  }
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_UH2(RTYPE, in0, in1, sat_val);              \
    SAT_UH2(RTYPE, in2, in3, sat_val);              \
  }
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1215 
/* Description : Saturate the halfword element values to the signed
                 range representable in (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' and from 'in1' is
                 saturated to the value generated with (sat_val + 1) bit range.
                 The results are written in place, so 'in0' and 'in1' must be
                 modifiable lvalues.
   Note        : Vector arguments are parenthesised inside the expansion so
                 that an expression argument is cast as a whole.
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)                \
  {                                                      \
    (in0) = (RTYPE)__msa_sat_s_h((v8i16)(in0), sat_val); \
    (in1) = (RTYPE)__msa_sat_s_h((v8i16)(in1), sat_val); \
  }
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1232 
/* Four-vector form of SAT_SH2: 'in0'..'in3' are saturated in place with the
   same 'sat_val'. Arguments are forwarded parenthesised so that expression
   arguments reach the casts in SAT_SH2 as single operands. */
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
  {                                                 \
    SAT_SH2(RTYPE, (in0), (in1), sat_val);          \
    SAT_SH2(RTYPE, (in2), (in3), sat_val);          \
  }
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1239 
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                  elements in 'out0' vector; 'idx1' element value from 'in'
                  is replicated to all elements in 'out1' vector
                  Valid index range for halfword operation is 0-7
   Note        : 'in' is expanded twice and is parenthesised under the cast
                 so that an expression argument is cast as a whole.
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)   \
  {                                                    \
    (out0) = (RTYPE)__msa_splati_h((v8i16)(in), idx0); \
    (out1) = (RTYPE)__msa_splati_h((v8i16)(in), idx1); \
  }
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1255 
/* Four-output form of SPLATI_H2: element 'idxN' of 'in' is replicated into
   'outN' for N = 0..3. Vector arguments are forwarded parenthesised so that
   expression arguments reach the casts in SPLATI_H2 as single operands. */
#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, out0, out1, out2, out3) \
  {                                                                          \
    SPLATI_H2(RTYPE, (in), idx0, idx1, (out0), (out1));                      \
    SPLATI_H2(RTYPE, (in), idx2, idx3, (out2), (out3));                      \
  }
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1263 
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the right
                 half of 'out0'. 'out1' is produced the same way from the
                 pair ('in2', 'in3').
   Note        : Vector arguments are parenthesised inside the expansion so
                 that an expression argument is cast as a whole.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)        \
  {                                                            \
    (out0) = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1)); \
    (out1) = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3)); \
  }
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
1280 
/* Four-output form of PCKEV_B2: the pairs (in0, in1), (in2, in3), (in4, in5)
   and (in6, in7) produce out0..out3 respectively. Arguments are forwarded
   parenthesised so that expression arguments reach the casts in PCKEV_B2 as
   single operands. */
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_B2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));            \
    PCKEV_B2(RTYPE, (in4), (in5), (in6), (in7), (out2), (out3));            \
  }
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1290 
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half of
                 'out0' & even halfword elements of 'in1' are copied to the
                 right half of 'out0'. 'out1' is produced the same way from
                 the pair ('in2', 'in3').
   Note        : Vector arguments are parenthesised inside the expansion so
                 that an expression argument is cast as a whole.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)        \
  {                                                            \
    (out0) = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1)); \
    (out1) = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
1306 
/* Four-output form of PCKEV_H2: the pairs (in0, in1), (in2, in3), (in4, in5)
   and (in6, in7) produce out0..out3 respectively. Arguments are forwarded
   parenthesised so that expression arguments reach the casts in PCKEV_H2 as
   single operands. */
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_H2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));            \
    PCKEV_H2(RTYPE, (in4), (in5), (in6), (in7), (out2), (out3));            \
  }
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1314 
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double elements of 'in0' are copied to the left half of
                 'out0' & even double elements of 'in1' are copied to the right
                 half of 'out0'. 'out1' is produced the same way from the
                 pair ('in2', 'in3').
   Note        : Vector arguments are parenthesised inside the expansion so
                 that an expression argument is cast as a whole.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)        \
  {                                                            \
    (out0) = (RTYPE)__msa_pckev_d((v2i64)(in0), (v2i64)(in1)); \
    (out1) = (RTYPE)__msa_pckev_d((v2i64)(in2), (v2i64)(in3)); \
  }
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
1330 
/* Four-output form of PCKEV_D2: the pairs (in0, in1), (in2, in3), (in4, in5)
   and (in6, in7) produce out0..out3 respectively. Arguments are forwarded
   parenthesised so that expression arguments reach the casts in PCKEV_D2 as
   single operands. */
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    PCKEV_D2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));            \
    PCKEV_D2(RTYPE, (in4), (in5), (in6), (in7), (out2), (out3));            \
  }
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1338 
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vectors 'in0' and 'in1'
                 is logically xor'ed with 128 and the result is stored
                 in-place, so both arguments must be modifiable lvalues.
   Note        : Vector arguments are parenthesised inside the expansion so
                 that an expression argument is cast as a whole.
*/
#define XORI_B2_128(RTYPE, in0, in1)                \
  {                                                 \
    (in0) = (RTYPE)__msa_xori_b((v16u8)(in0), 128); \
    (in1) = (RTYPE)__msa_xori_b((v16u8)(in1), 128); \
  }
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
1353 
/* Three-vector form of XORI_B2_128: 'in0', 'in1' and 'in2' are each xor'ed
   with 128 in place. Arguments are parenthesised (directly or when forwarded)
   so that an expression argument is cast as a whole. */
#define XORI_B3_128(RTYPE, in0, in1, in2)           \
  {                                                 \
    XORI_B2_128(RTYPE, (in0), (in1));               \
    (in2) = (RTYPE)__msa_xori_b((v16u8)(in2), 128); \
  }
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
1360 
/* Four-vector form of XORI_B2_128: 'in0'..'in3' are each xor'ed with 128 in
   place. Arguments are forwarded parenthesised so that expression arguments
   reach the casts in XORI_B2_128 as single operands. */
#define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
  {                                            \
    XORI_B2_128(RTYPE, (in0), (in1));          \
    XORI_B2_128(RTYPE, (in2), (in3));          \
  }
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
1368 
/* Seven-vector form: 'in0'..'in3' go through XORI_B4_128 and 'in4'..'in6'
   through XORI_B3_128, all in place. Arguments are forwarded parenthesised so
   that expression arguments reach the underlying casts as single operands. */
#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
  {                                                           \
    XORI_B4_128(RTYPE, (in0), (in1), (in2), (in3));           \
    XORI_B3_128(RTYPE, (in4), (in5), (in6));                  \
  }
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
1375 
/* Description : Average of signed halfword elements -> (a + b) / 2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is added to each
                 signed halfword element of 'in1' with full precision resulting
                 in one extra bit in the result. The result is then divided by
                 2 and written to 'out0'. The pairs ('in2', 'in3'),
                 ('in4', 'in5') and ('in6', 'in7') produce 'out1', 'out2' and
                 'out3' the same way.
   Note        : Vector arguments are parenthesised inside the expansion so
                 that an expression argument is cast as a whole.
*/
#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                out2, out3)                                                \
  {                                                                        \
    (out0) = (RTYPE)__msa_ave_s_h((v8i16)(in0), (v8i16)(in1));             \
    (out1) = (RTYPE)__msa_ave_s_h((v8i16)(in2), (v8i16)(in3));             \
    (out2) = (RTYPE)__msa_ave_s_h((v8i16)(in4), (v8i16)(in5));             \
    (out3) = (RTYPE)__msa_ave_s_h((v8i16)(in6), (v8i16)(in7));             \
  }
#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
1394 
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between halfword data type range and written to 'out0'.
                 'out1' is produced the same way from ('in2', 'in3').
   Note        : Vector arguments are parenthesised inside the expansion so
                 that an expression argument is cast as a whole.
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)         \
  {                                                             \
    (out0) = (RTYPE)__msa_adds_s_h((v8i16)(in0), (v8i16)(in1)); \
    (out1) = (RTYPE)__msa_adds_s_h((v8i16)(in2), (v8i16)(in3)); \
  }
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
1409 
/* Four-output form of ADDS_SH2: the pairs (in0, in1), (in2, in3), (in4, in5)
   and (in6, in7) produce out0..out3 respectively. Arguments are forwarded
   parenthesised so that expression arguments reach the casts in ADDS_SH2 as
   single operands. */
#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                 out2, out3)                                                \
  {                                                                         \
    ADDS_SH2(RTYPE, (in0), (in1), (in2), (in3), (out0), (out1));            \
    ADDS_SH2(RTYPE, (in4), (in5), (in6), (in7), (out2), (out3));            \
  }
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1417 
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vectors 'in0'..'in3' is left shifted by
                 'shift' and the result is written in-place, so the four
                 vector arguments must be modifiable lvalues.
   Note        : Every argument is parenthesised so that an expression such as
                 'a | b' passed as 'shift' is applied as a whole rather than
                 being split by operator precedence.
*/
#define SLLI_4V(in0, in1, in2, in3, shift) \
  {                                        \
    (in0) = (in0) << (shift);              \
    (in1) = (in1) << (shift);              \
    (in2) = (in2) << (shift);              \
    (in3) = (in3) << (shift);              \
  }
1432 
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vectors 'in0' and 'in1' is right shifted by
                 'shift' using the C '>>' operator and the result is written
                 in-place. 'shift' is a GP variable.
   Note        : Every argument is parenthesised so that an expression passed
                 as 'shift' is applied as a whole rather than being split by
                 operator precedence.
*/
#define SRA_2V(in0, in1, shift) \
  {                             \
    (in0) = (in0) >> (shift);   \
    (in1) = (in1) >> (shift);   \
  }
1446 
/* Four-vector form of the in-place right shift: 'in0'..'in3' are each
   replaced by their value shifted right by 'shift' with the C '>>' operator.
   Every argument is parenthesised so that an expression passed as 'shift' is
   applied as a whole. */
#define SRA_4V(in0, in1, in2, in3, shift) \
  {                                       \
    (in0) = (in0) >> (shift);             \
    (in1) = (in1) >> (shift);             \
    (in2) = (in2) >> (shift);             \
    (in3) = (in3) >> (shift);             \
  }
1454 
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vectors 'in0' and 'in1' is shifted right
                 arithmetically by the number of bits in the corresponding
                 element in the vector 'shift'. The last discarded bit is added
                 to shifted value for rounding and the result is written
                 in-place. 'shift' is a vector.
   Note        : Vector arguments (including 'shift') are parenthesised inside
                 the expansion so that an expression argument is cast as a
                 whole.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                          \
  {                                                              \
    (in0) = (RTYPE)__msa_srar_w((v4i32)(in0), (v4i32)(shift));   \
    (in1) = (RTYPE)__msa_srar_w((v4i32)(in1), (v4i32)(shift));   \
  }
1470 
/* Four-vector form of SRAR_W2: 'in0'..'in3' are shifted in place by the
   per-element amounts in the vector 'shift'. Each inner invocation is now
   terminated with ';' like the sibling macros, and arguments are forwarded
   parenthesised so that expression arguments reach the casts in SRAR_W2 as
   single operands. */
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                               \
    SRAR_W2(RTYPE, (in0), (in1), (shift));        \
    SRAR_W2(RTYPE, (in2), (in3), (shift));        \
  }
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
1477 
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vectors 'in0' and 'in1' is shifted right
                 arithmetically by the value in 'shift'. The last discarded bit
                 is added to the shifted value for rounding and the result is
                 written in-place. 'shift' is an immediate value.
   Note        : Vector arguments are parenthesised inside the expansion so
                 that an expression argument is cast as a whole.
*/
#define SRARI_H2(RTYPE, in0, in1, shift)               \
  {                                                    \
    (in0) = (RTYPE)__msa_srari_h((v8i16)(in0), shift); \
    (in1) = (RTYPE)__msa_srari_h((v8i16)(in1), shift); \
  }
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
1494 
/* Four-vector form of SRARI_H2: 'in0'..'in3' are rounded-shifted in place by
   the immediate 'shift'. Vector arguments are forwarded parenthesised so that
   expression arguments reach the casts in SRARI_H2 as single operands. */
#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_H2(RTYPE, (in0), (in1), shift);          \
    SRARI_H2(RTYPE, (in2), (in3), shift);          \
  }
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
1502 
/* Word-element counterpart of SRARI_H2: 'in0' and 'in1' are passed through
   __msa_srari_w with the immediate 'shift' and written back in place. Vector
   arguments are parenthesised so that an expression argument is cast as a
   whole. */
#define SRARI_W2(RTYPE, in0, in1, shift)               \
  {                                                    \
    (in0) = (RTYPE)__msa_srari_w((v4i32)(in0), shift); \
    (in1) = (RTYPE)__msa_srari_w((v4i32)(in1), shift); \
  }
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
1509 
/* Four-vector form of SRARI_W2: 'in0'..'in3' are rounded-shifted in place by
   the immediate 'shift'. Vector arguments are forwarded parenthesised so that
   expression arguments reach the casts in SRARI_W2 as single operands. */
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
  {                                                \
    SRARI_W2(RTYPE, (in0), (in1), shift);          \
    SRARI_W2(RTYPE, (in2), (in3), shift);          \
  }
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1516 
/* Description : Logical shift right all elements of vector (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written to 'out0' (the inputs are not
                 modified); likewise 'in1'..'in3' produce 'out1'..'out3'.
                 'shift' is an immediate value.
   Note        : Vector arguments are parenthesised inside the expansion so
                 that an expression argument is cast as a whole.
*/
#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) \
  {                                                                       \
    (out0) = (RTYPE)__msa_srli_h((v8i16)(in0), shift);                    \
    (out1) = (RTYPE)__msa_srli_h((v8i16)(in1), shift);                    \
    (out2) = (RTYPE)__msa_srli_h((v8i16)(in2), shift);                    \
    (out3) = (RTYPE)__msa_srli_h((v8i16)(in3), shift);                    \
  }
#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
1532 
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (MUL4: in0 .. in7)
                 Outputs - out0, out1         (MUL4: out0 .. out3)
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and the result is written to 'out0'; the following input
                 pairs produce the following outputs the same way.
   Note        : Operands are parenthesised so that an expression argument
                 such as 'a + b' is multiplied as a whole instead of being
                 split by operator precedence.
*/
#define MUL2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    (out0) = (in0) * (in1);                  \
    (out1) = (in2) * (in3);                  \
  }
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    MUL2(in0, in1, in2, in3, out0, out1);                                    \
    MUL2(in4, in5, in6, in7, out2, out3);                                    \
  }
1549 
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (ADD4: in0 .. in7)
                 Outputs - out0, out1         (ADD4: out0 .. out3)
   Details     : Each element in 'in0' is added to 'in1' and result is written
                 to 'out0'; the following input pairs produce the following
                 outputs the same way.
   Note        : Operands are parenthesised so that an expression argument
                 such as 'a << b' or 'a & b' is added as a whole instead of
                 being split by operator precedence.
*/
#define ADD2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    (out0) = (in0) + (in1);                  \
    (out1) = (in2) + (in3);                  \
  }
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    ADD2(in0, in1, in2, in3, out0, out1);                                    \
    ADD2(in4, in5, in6, in7, out2, out3);                                    \
  }
1566 
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (SUB4: in0 .. in7)
                 Outputs - out0, out1         (SUB4: out0 .. out3)
   Details     : Each element in 'in1' is subtracted from 'in0' and result is
                 written to 'out0'; the following input pairs produce the
                 following outputs the same way.
   Note        : Operands are parenthesised so that an expression subtrahend
                 such as 'a - b' is subtracted as a whole ('x - (a - b)', not
                 'x - a - b'). SUB4 delegates to SUB2, matching ADD4 and MUL4.
*/
#define SUB2(in0, in1, in2, in3, out0, out1) \
  {                                          \
    (out0) = (in0) - (in1);                  \
    (out1) = (in2) - (in3);                  \
  }
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
  {                                                                          \
    SUB2(in0, in1, in2, in3, out0, out1);                                    \
    SUB2(in4, in5, in6, in7, out2, out3);                                    \
  }
1585 
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in    (halfword vector)
                 Output - out   (sign extended word vector)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted (via a "less than zero" compare) and interleaved
                 with the same vector 'in' to generate 4 word elements keeping
                 sign intact
   Note        : 'in' is expanded twice; it is parenthesised under each cast
                 so that an expression argument is cast as a whole.
*/
#define UNPCK_R_SH_SW(in, out)                        \
  {                                                   \
    v8i16 sign_m;                                     \
                                                      \
    sign_m = __msa_clti_s_h((v8i16)(in), 0);          \
    (out) = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in)); \
  }
1601 
/* Description : Sign extend byte elements from input vector and return
                 halfword results in pair of vectors
   Arguments   : Input   - in           (byte vector)
                 Outputs - out0, out1   (sign extended halfword vectors)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in' to
                 generate 8 signed halfword elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 8 signed halfword elements in 'out1'
   Note        : 'in' is parenthesised under the cast and when forwarded to
                 ILVRL_B2_SH so that an expression argument stays a single
                 operand.
*/
#define UNPCK_SB_SH(in, out0, out1)             \
  {                                             \
    v16i8 tmp_m;                                \
                                                \
    tmp_m = __msa_clti_s_b((v16i8)(in), 0);     \
    ILVRL_B2_SH(tmp_m, (in), (out0), (out1));   \
  }
1620 
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in          (unsigned byte vector)
                 Outputs - out0, out1  (zero extended halfword vectors)
                 Return Type - signed halfword (results come from the _SH
                               flavour of ILVRL_B2)
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
   Note        : Arguments are forwarded parenthesised so that an expression
                 argument reaches the casts in ILVRL_B2_SH as a single operand.
*/
#define UNPCK_UB_SH(in, out0, out1)            \
  {                                            \
    v16i8 zero_m = { 0 };                      \
                                               \
    ILVRL_B2_SH(zero_m, (in), (out0), (out1)); \
  }
1634 
/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input   - in            (halfword vector)
                 Outputs - out0, out1   (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 4 signed word elements in 'out1'
   Note        : 'in' is parenthesised under the cast and when forwarded to
                 ILVRL_H2_SW so that an expression argument stays a single
                 operand.
*/
#define UNPCK_SH_SW(in, out0, out1)             \
  {                                             \
    v8i16 tmp_m;                                \
                                                \
    tmp_m = __msa_clti_s_h((v8i16)(in), 0);     \
    ILVRL_H2_SW(tmp_m, (in), (out0), (out1));   \
  }
1653 
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation:
                   out0 = in0 + in3,  out1 = in1 + in2,
                   out2 = in1 - in2,  out3 = in0 - in3
                 Outputs are assigned in that order while inputs are still
                 being read, so an output must not name an input that a later
                 line reads.
   Note        : Operands are parenthesised so that expression arguments are
                 combined as a whole instead of being split by precedence.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                             \
    (out0) = (in0) + (in3);                                     \
    (out1) = (in1) + (in2);                                     \
                                                                \
    (out2) = (in1) - (in2);                                     \
    (out3) = (in0) - (in3);                                     \
  }
1667 
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation: for k = 0..3,
                   out(k)     = in(k) + in(7 - k)
                   out(7 - k) = in(k) - in(7 - k)
                 Outputs are assigned one after another while inputs are still
                 being read, so an output must not name an input that a later
                 line reads.
   Note        : Operands are parenthesised so that expression arguments are
                 combined as a whole instead of being split by precedence.
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
                    out3, out4, out5, out6, out7)                             \
  {                                                                           \
    (out0) = (in0) + (in7);                                                   \
    (out1) = (in1) + (in6);                                                   \
    (out2) = (in2) + (in5);                                                   \
    (out3) = (in3) + (in4);                                                   \
                                                                              \
    (out4) = (in3) - (in4);                                                   \
    (out5) = (in2) - (in5);                                                   \
    (out6) = (in1) - (in6);                                                   \
    (out7) = (in0) - (in7);                                                   \
  }
1686 
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation: for k = 0..7,
                   out(k)      = in(k) + in(15 - k)
                   out(15 - k) = in(k) - in(15 - k)
                 Outputs are assigned one after another while inputs are still
                 being read, so an output must not name an input that a later
                 line reads.
   Note        : Operands are parenthesised so that expression arguments are
                 combined as a whole instead of being split by precedence.
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,  \
                     in11, in12, in13, in14, in15, out0, out1, out2, out3,    \
                     out4, out5, out6, out7, out8, out9, out10, out11, out12, \
                     out13, out14, out15)                                     \
  {                                                                           \
    (out0) = (in0) + (in15);                                                  \
    (out1) = (in1) + (in14);                                                  \
    (out2) = (in2) + (in13);                                                  \
    (out3) = (in3) + (in12);                                                  \
    (out4) = (in4) + (in11);                                                  \
    (out5) = (in5) + (in10);                                                  \
    (out6) = (in6) + (in9);                                                   \
    (out7) = (in7) + (in8);                                                   \
                                                                              \
    (out8) = (in7) - (in8);                                                   \
    (out9) = (in6) - (in9);                                                   \
    (out10) = (in5) - (in10);                                                 \
    (out11) = (in4) - (in11);                                                 \
    (out12) = (in3) - (in12);                                                 \
    (out13) = (in2) - (in13);                                                 \
    (out14) = (in1) - (in14);                                                 \
    (out15) = (in0) - (in15);                                                 \
  }
1715 
/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : The inputs are right-interleaved at byte level, the
                 intermediate vectors are interleaved again at byte and then
                 word level into out0, out2, out4 and out6, and the odd outputs
                 are derived from those with SLDI_B2_0 (slide value 8).
   Note        : Caller-supplied arguments are forwarded parenthesised so that
                 expression arguments reach the casts inside the helper macros
                 as single operands.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0,   \
                        out1, out2, out3, out4, out5, out6, out7)              \
  {                                                                            \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                      \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                      \
                                                                               \
    ILVR_B4_SB((in2), (in0), (in3), (in1), (in6), (in4), (in7), (in5), tmp0_m, \
               tmp1_m, tmp2_m, tmp3_m);                                        \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                               \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                               \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, (out0), (out2));                           \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, (out4), (out6));                           \
    SLDI_B2_0(RTYPE, (out0), (out2), (out1), (out3), 8);                       \
    SLDI_B2_0(RTYPE, (out4), (out6), (out5), (out7), 8);                       \
  }
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
1737 
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : Rows k and k + 8 are first combined with an even-doubleword
                 interleave into out7..out0, which are then used as scratch:
                 they are read back (and out5/out7 rewritten) by the byte,
                 halfword and word interleave stages before receiving their
                 final values. Outputs are therefore written before all inputs
                 have been read and must not name any of the inputs.
   Note        : Caller-supplied arguments are parenthesised under every cast
                 and when forwarded to helper macros so that an expression
                 argument stays a single operand.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, \
                            in10, in11, in12, in13, in14, in15, out0, out1,   \
                            out2, out3, out4, out5, out6, out7)               \
  {                                                                           \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
                                                                              \
    ILVEV_D2_UB((in0), (in8), (in1), (in9), (out7), (out6));                  \
    ILVEV_D2_UB((in2), (in10), (in3), (in11), (out5), (out4));                \
    ILVEV_D2_UB((in4), (in12), (in5), (in13), (out3), (out2));                \
    ILVEV_D2_UB((in6), (in14), (in7), (in15), (out1), (out0));                \
                                                                              \
    tmp0_m = (v16u8)__msa_ilvev_b((v16i8)(out6), (v16i8)(out7));              \
    tmp4_m = (v16u8)__msa_ilvod_b((v16i8)(out6), (v16i8)(out7));              \
    tmp1_m = (v16u8)__msa_ilvev_b((v16i8)(out4), (v16i8)(out5));              \
    tmp5_m = (v16u8)__msa_ilvod_b((v16i8)(out4), (v16i8)(out5));              \
    (out5) = (v16u8)__msa_ilvev_b((v16i8)(out2), (v16i8)(out3));              \
    tmp6_m = (v16u8)__msa_ilvod_b((v16i8)(out2), (v16i8)(out3));              \
    (out7) = (v16u8)__msa_ilvev_b((v16i8)(out0), (v16i8)(out1));              \
    tmp7_m = (v16u8)__msa_ilvod_b((v16i8)(out0), (v16i8)(out1));              \
                                                                              \
    ILVEV_H2_UB(tmp0_m, tmp1_m, (out5), (out7), tmp2_m, tmp3_m);              \
    (out0) = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);              \
    (out4) = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);              \
                                                                              \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)(out7), (v8i16)(out5));              \
    (out2) = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);              \
    (out6) = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);              \
                                                                              \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);              \
    (out1) = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);              \
    (out5) = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);              \
                                                                              \
    tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);              \
    tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);              \
    (out3) = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);              \
    (out7) = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);              \
  }
1783 
/* Description : Transpose 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Rows are right-interleaved at halfword level, then interleaved
                 at word level into 'out0' and 'out2'. 'out1' and 'out3' are
                 then built with a left doubleword interleave that reads
                 'out0' and 'out2' back, so the four outputs must be distinct
                 variables.
   Note        : Caller-supplied arguments are parenthesised under every cast
                 and when forwarded to helper macros so that an expression
                 argument stays a single operand.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 s0_m, s1_m;                                                  \
                                                                       \
    ILVR_H2_SH((in1), (in0), (in3), (in2), s0_m, s1_m);                \
    ILVRL_W2_SH(s1_m, s0_m, (out0), (out2));                           \
    (out1) = (v8i16)__msa_ilvl_d((v2i64)(out0), (v2i64)(out0));        \
    (out3) = (v8i16)__msa_ilvl_d((v2i64)(out0), (v2i64)(out2));        \
  }
1798 
/* Description : 4x8 transpose of signed halfword elements held in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - signed halfword
   Details     : Only the right halves of the eight inputs are read. They are
                 interleaved at halfword, word and doubleword width to
                 produce 'out0'..'out3'. 'out4'..'out7' are set to zero.
                 Every input is consumed before any output is written.
*/
#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                           out2, out3, out4, out5, out6, out7)                 \
  {                                                                            \
    v8i16 h01_m, h23_m, h45_m, h67_m;                                          \
    v8i16 w03r_m, w03l_m, w47r_m, w47l_m;                                      \
    v8i16 zero_m = { 0 };                                                      \
                                                                               \
    ILVR_H2_SH(in1, in0, in3, in2, h01_m, h23_m);                              \
    ILVR_H2_SH(in5, in4, in7, in6, h45_m, h67_m);                              \
    ILVRL_W2_SH(h23_m, h01_m, w03r_m, w03l_m);                                 \
    ILVRL_W2_SH(h67_m, h45_m, w47r_m, w47l_m);                                 \
                                                                               \
    out0 = (v8i16)__msa_ilvr_d((v2i64)w47r_m, (v2i64)w03r_m);                  \
    out1 = (v8i16)__msa_ilvl_d((v2i64)w47r_m, (v2i64)w03r_m);                  \
    out2 = (v8i16)__msa_ilvr_d((v2i64)w47l_m, (v2i64)w03l_m);                  \
    out3 = (v8i16)__msa_ilvl_d((v2i64)w47l_m, (v2i64)w03l_m);                  \
                                                                               \
    /* The remaining four outputs carry no data and are cleared. */            \
    out4 = zero_m;                                                             \
    out5 = zero_m;                                                             \
    out6 = zero_m;                                                             \
    out7 = zero_m;                                                             \
  }
1826 
/* Description : 8x4 transpose of signed halfword elements held in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Both halves of the pairs (in0, in1) and (in2, in3) are
                 interleaved at halfword width. The right-half results are
                 then interleaved at word width into 'out0' (right) and
                 'out1' (left), and the left-half results into 'out2' (right)
                 and 'out3' (left).
*/
#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v8i16 lo01_m, hi01_m, lo23_m, hi23_m;                              \
                                                                       \
    ILVRL_H2_SH(in1, in0, lo01_m, hi01_m);                             \
    ILVRL_H2_SH(in3, in2, lo23_m, hi23_m);                             \
    ILVRL_W2_SH(lo23_m, lo01_m, out0, out1);                           \
    ILVRL_W2_SH(hi23_m, hi01_m, out2, out3);                           \
  }
1841 
/* Description : 8x8 transpose of halfword elements held in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Rows in0..in3 and rows in4..in7 are each reduced, by two
                 rounds of halfword interleaving, to four intermediate
                 vectors. Even-numbered outputs pack the even doublewords of
                 matching intermediates, and odd-numbered outputs pack the
                 odd doublewords. Every input is consumed before any output
                 is written.
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, \
                       out1, out2, out3, out4, out5, out6, out7)            \
  {                                                                         \
    v8i16 ev_m, od_m;                                                       \
    v8i16 top0_m, top1_m, top2_m, top3_m;                                   \
    v8i16 bot0_m, bot1_m, bot2_m, bot3_m;                                   \
                                                                            \
    /* Input rows in0..in3 */                                               \
    ILVR_H2_SH(in2, in0, in3, in1, ev_m, od_m);                             \
    ILVRL_H2_SH(od_m, ev_m, top0_m, top1_m);                                \
    ILVL_H2_SH(in2, in0, in3, in1, ev_m, od_m);                             \
    ILVRL_H2_SH(od_m, ev_m, top2_m, top3_m);                                \
                                                                            \
    /* Input rows in4..in7 */                                               \
    ILVR_H2_SH(in6, in4, in7, in5, ev_m, od_m);                             \
    ILVRL_H2_SH(od_m, ev_m, bot0_m, bot1_m);                                \
    ILVL_H2_SH(in6, in4, in7, in5, ev_m, od_m);                             \
    ILVRL_H2_SH(od_m, ev_m, bot2_m, bot3_m);                                \
                                                                            \
    PCKEV_D4(RTYPE, bot0_m, top0_m, bot1_m, top1_m, bot2_m, top2_m, bot3_m, \
             top3_m, out0, out2, out4, out6);                               \
    out1 = (RTYPE)__msa_pckod_d((v2i64)bot0_m, (v2i64)top0_m);              \
    out3 = (RTYPE)__msa_pckod_d((v2i64)bot1_m, (v2i64)top1_m);              \
    out5 = (RTYPE)__msa_pckod_d((v2i64)bot2_m, (v2i64)top2_m);              \
    out7 = (RTYPE)__msa_pckod_d((v2i64)bot3_m, (v2i64)top3_m);              \
  }
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
1870 
/* Description : 4x4 transpose of signed word elements held in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : The pairs (in0, in1) and (in2, in3) are interleaved at word
                 width (right and left halves). The four results are then
                 interleaved at doubleword width to form the outputs. Every
                 input is consumed before any output is written.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
  {                                                                    \
    v4i32 lo01_m, hi01_m, lo23_m, hi23_m;                              \
                                                                       \
    ILVRL_W2_SW(in1, in0, lo01_m, hi01_m);                             \
    ILVRL_W2_SW(in3, in2, lo23_m, hi23_m);                             \
                                                                       \
    out0 = (v4i32)__msa_ilvr_d((v2i64)lo23_m, (v2i64)lo01_m);          \
    out1 = (v4i32)__msa_ilvl_d((v2i64)lo23_m, (v2i64)lo01_m);          \
    out2 = (v4i32)__msa_ilvr_d((v2i64)hi23_m, (v2i64)hi01_m);          \
    out3 = (v4i32)__msa_ilvl_d((v2i64)hi23_m, (v2i64)hi01_m);          \
  }
1888 
/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : The right doublewords of (in0, in1) and of (in2, in3) are
                 combined into two halfword vectors. Four 32-bit words are
                 loaded from 'pdst' at 'stride' intervals, zero-extended from
                 bytes to halfwords and added to those vectors. The sums are
                 clipped between 0-255, packed back to bytes and stored to
                 the same four rows.
                 'pdst' and 'stride' are expanded more than once, so pass
                 expressions without side effects.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)        \
  {                                                              \
    uint32_t src0_m, src1_m, src2_m, src3_m;                     \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                        \
    v16i8 dst0_m = { 0 };                                        \
    v16i8 dst1_m = { 0 };                                        \
    v16i8 zero_m = { 0 };                                        \
                                                                 \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);              \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);           \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                        \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                        \
    /* Interleaving with zero widens the bytes to halfwords. */  \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);  \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);        \
    CLIP_SH2_0_255(res0_m, res1_m);                              \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
    ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);          \
  }
1912 
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed with
                 128 to shift the range from signed to unsigned byte.
                 The arguments are parenthesized before being cast, so an
                 expression argument is cast as a whole.
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
  ({                                                          \
    v16u8 out_m;                                              \
                                                              \
    out_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
    out_m = (v16u8)__msa_xori_b(out_m, 128);                  \
    out_m;                                                    \
  })
1929 
/* Description : Convert inputs to unsigned bytes, average with the given
                 destination vectors and store as an 8x4 unsigned byte block
   Arguments   : Inputs  - in0, in1, in2, in3, dst0, dst1, pdst, stride
   Details     : (in0, in1) and (in2, in3) are each packed and range-shifted
                 by PCKEV_XORI128_UB. The two results are averaged with
                 'dst0' and 'dst1' respectively and written through
                 ST8x4_UB starting at 'pdst' with the given 'stride'.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \
  {                                                                           \
    uint8_t *dst_ptr_m = (uint8_t *)(pdst);                                   \
    v16u8 pix0_m = PCKEV_XORI128_UB(in0, in1);                                \
    v16u8 pix1_m = PCKEV_XORI128_UB(in2, in3);                                \
                                                                              \
    AVER_UB2_UB(pix0_m, dst0, pix1_m, dst1, pix0_m, pix1_m);                  \
    ST8x4_UB(pix0_m, pix1_m, dst_ptr_m, stride);                              \
  }
1944 
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst
   Details     : Even byte elements of 'in0' and 'in1' are packed into one
                 signed byte vector, which is stored at 'pdst'.
                 The arguments are parenthesized before being cast, so an
                 expression argument is cast as a whole.
*/
#define PCKEV_ST_SB(in0, in1, pdst)                    \
  {                                                    \
    v16i8 tmp_m;                                       \
                                                       \
    tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in0)); \
    ST_SB(tmp_m, (pdst));                              \
  }
1956 
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
                 Return Type - unsigned halfword
   Details     : Bytes of 'in0' and 'in1' are shuffled according to 'mask'.
                 The unsigned dot product of the shuffled bytes with 'coeff'
                 gives halfword sums, which are shifted right by 'shift'
                 with rounding.
                 The vector arguments are parenthesized before being cast,
                 so an expression argument is cast as a whole.
                 NOTE(review): 'shift' feeds an immediate-form intrinsic and
                 presumably must be a compile-time constant - confirm.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)              \
  ({                                                                  \
    v16i8 tmp0_m;                                                     \
    v8u16 tmp1_m;                                                     \
                                                                      \
    tmp0_m = __msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0)); \
    tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)(coeff));           \
    tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);              \
                                                                      \
    tmp1_m;                                                           \
  })
1971 #endif  // VPX_VPX_DSP_MIPS_MACROS_MSA_H_
1972