• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /*
2   *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3   *
4   *  Use of this source code is governed by a BSD-style license
5   *  that can be found in the LICENSE file in the root of the source
6   *  tree. An additional intellectual property rights grant can be found
7   *  in the file PATENTS.  All contributing project authors may
8   *  be found in the AUTHORS file in the root of the source tree.
9   */
10  
11  #ifndef VPX_DSP_MIPS_MACROS_MSA_H_
12  #define VPX_DSP_MIPS_MACROS_MSA_H_
13  
14  #include <msa.h>
15  
16  #include "./vpx_config.h"
17  #include "vpx/vpx_integer.h"
18  
/* Description : Vector load / store primitives
   Arguments   : LD_x(RTYPE, psrc)     - read one object of type RTYPE
                                         from address 'psrc'
                 ST_x(RTYPE, in, pdst) - write 'in' as one object of type
                                         RTYPE to address 'pdst'
   Details     : The _B, _H and _W variants expand to the same dereference;
                 the suffix only pairs with the element type chosen by the
                 typed wrappers (LD_UB, LD_SH, ST_SW, ...).
                 Every expansion is wrapped in parentheses so that it stays
                 a single operand when used inside a larger expression.
*/
#define LD_B(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

#define LD_W(RTYPE, psrc) (*((const RTYPE *)(psrc)))
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)

#define ST_B(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
39  
40  #if (__mips_isa_rev >= 6)
/* Description : Scalar loads (variant selected when __mips_isa_rev >= 6)
   Arguments   : Input  - psrc
                 Return - 16-bit (LH), 32-bit (LW) or 64-bit (LD) value
   Details     : LH and LW issue one 'lh' / 'lw' instruction on the address
                 'psrc' and yield the loaded value as the result of a GNU
                 statement expression.
                 LD issues one 'ld' when __mips == 64. Otherwise it is built
                 from two LW loads: the word at (psrc) supplies the low
                 32 bits and the word at (psrc + 4) the high 32 bits.
   Note        : All temporaries carry a macro-specific prefix (lh_, lw_,
                 ld_). The locals of a statement expression live in the
                 caller's scope, so a generic name such as 'psrc_m' would
                 shadow an identically named argument and the initializer
                 would read the new, still uninitialized variable.
*/
#define LH(psrc) ({ \
  const uint8_t *lh_psrc_m = (const uint8_t *)(psrc); \
  uint16_t lh_val_m; \
  \
  __asm__ __volatile__ ( \
      "lh  %[lh_val_m],  %[lh_psrc_m]  \n\t" \
      \
      : [lh_val_m] "=r" (lh_val_m) \
      : [lh_psrc_m] "m" (*lh_psrc_m) \
  ); \
  \
  lh_val_m; \
})

#define LW(psrc) ({ \
  const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \
  uint32_t lw_val_m; \
  \
  __asm__ __volatile__ ( \
      "lw  %[lw_val_m],  %[lw_psrc_m]  \n\t" \
      \
      : [lw_val_m] "=r" (lw_val_m) \
      : [lw_psrc_m] "m" (*lw_psrc_m) \
  ); \
  \
  lw_val_m; \
})

#if (__mips == 64)
#define LD(psrc) ({ \
  const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \
  uint64_t ld_val_m = 0; \
  \
  __asm__ __volatile__ ( \
      "ld  %[ld_val_m],  %[ld_psrc_m]  \n\t" \
      \
      : [ld_val_m] "=r" (ld_val_m) \
      : [ld_psrc_m] "m" (*ld_psrc_m) \
  ); \
  \
  ld_val_m; \
})
#else  // !(__mips == 64)
#define LD(psrc) ({ \
  const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \
  uint32_t ld_val0_m, ld_val1_m; \
  uint64_t ld_val_m = 0; \
  \
  ld_val0_m = LW(ld_psrc_m); \
  ld_val1_m = LW(ld_psrc_m + 4); \
  \
  ld_val_m = ((uint64_t)ld_val1_m << 32) | (uint64_t)ld_val0_m; \
  \
  ld_val_m; \
})
#endif  // (__mips == 64)
99  
/* Description : Scalar stores (variant selected when __mips_isa_rev >= 6)
   Arguments   : Inputs - val, pdst
   Details     : SH, SW and SD convert 'val' to uint16_t / uint32_t /
                 uint64_t and write it to the address 'pdst' with one
                 'sh' / 'sw' / 'sd' instruction.
   Note        : All temporaries carry a macro-specific prefix (sh_, sw_,
                 sd_) so that an argument that happens to be called 'val_m'
                 or 'pdst_m' is not shadowed by the macro's own locals.
                 NOTE(review): SD emits 'sd' regardless of __mips, unlike LD
                 which has a separate path when __mips != 64 - confirm this
                 is intended for 32-bit release 6 targets.
*/
#define SH(val, pdst) { \
  uint8_t *sh_pdst_m = (uint8_t *)(pdst); \
  const uint16_t sh_val_m = (val); \
  \
  __asm__ __volatile__ ( \
      "sh  %[sh_val_m],  %[sh_pdst_m]  \n\t" \
      \
      : [sh_pdst_m] "=m" (*sh_pdst_m) \
      : [sh_val_m] "r" (sh_val_m) \
  ); \
}

#define SW(val, pdst) { \
  uint8_t *sw_pdst_m = (uint8_t *)(pdst); \
  const uint32_t sw_val_m = (val); \
  \
  __asm__ __volatile__ ( \
      "sw  %[sw_val_m],  %[sw_pdst_m]  \n\t" \
      \
      : [sw_pdst_m] "=m" (*sw_pdst_m) \
      : [sw_val_m] "r" (sw_val_m) \
  ); \
}

#define SD(val, pdst) { \
  uint8_t *sd_pdst_m = (uint8_t *)(pdst); \
  const uint64_t sd_val_m = (val); \
  \
  __asm__ __volatile__ ( \
      "sd  %[sd_val_m],  %[sd_pdst_m]  \n\t" \
      \
      : [sd_pdst_m] "=m" (*sd_pdst_m) \
      : [sd_val_m] "r" (sd_val_m) \
  ); \
}
135  #else  // !(__mips_isa_rev >= 6)
/* Description : Scalar loads (variant selected when __mips_isa_rev < 6)
   Arguments   : Input  - psrc
                 Return - 16-bit (LH), 32-bit (LW) or 64-bit (LD) value
   Details     : LH and LW issue the 'ulh' / 'ulw' assembler instructions on
                 the address 'psrc' and yield the loaded value as the result
                 of a GNU statement expression.
                 LD issues 'uld' when __mips == 64. Otherwise it is built
                 from two LW loads: the word at (psrc) supplies the low
                 32 bits and the word at (psrc + 4) the high 32 bits.
   Note        : All temporaries carry a macro-specific prefix (lh_, lw_,
                 ld_) so that the macro's locals can never shadow an
                 argument with the same name in the caller's scope.
*/
#define LH(psrc) ({ \
  const uint8_t *lh_psrc_m = (const uint8_t *)(psrc); \
  uint16_t lh_val_m; \
  \
  __asm__ __volatile__ ( \
      "ulh  %[lh_val_m],  %[lh_psrc_m]  \n\t" \
      \
      : [lh_val_m] "=r" (lh_val_m) \
      : [lh_psrc_m] "m" (*lh_psrc_m) \
  ); \
  \
  lh_val_m; \
})

#define LW(psrc) ({ \
  const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \
  uint32_t lw_val_m; \
  \
  __asm__ __volatile__ ( \
      "ulw  %[lw_val_m],  %[lw_psrc_m]  \n\t" \
      \
      : [lw_val_m] "=r" (lw_val_m) \
      : [lw_psrc_m] "m" (*lw_psrc_m) \
  ); \
  \
  lw_val_m; \
})

#if (__mips == 64)
#define LD(psrc) ({ \
  const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \
  uint64_t ld_val_m = 0; \
  \
  __asm__ __volatile__ ( \
      "uld  %[ld_val_m],  %[ld_psrc_m]  \n\t" \
      \
      : [ld_val_m] "=r" (ld_val_m) \
      : [ld_psrc_m] "m" (*ld_psrc_m) \
  ); \
  \
  ld_val_m; \
})
#else  // !(__mips == 64)
#define LD(psrc) ({ \
  const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \
  uint32_t ld_val0_m, ld_val1_m; \
  uint64_t ld_val_m = 0; \
  \
  ld_val0_m = LW(ld_psrc_m); \
  ld_val1_m = LW(ld_psrc_m + 4); \
  \
  ld_val_m = ((uint64_t)ld_val1_m << 32) | (uint64_t)ld_val0_m; \
  \
  ld_val_m; \
})
#endif  // (__mips == 64)
194  
/* Description : Scalar stores (variant selected when __mips_isa_rev < 6)
   Arguments   : Inputs - val, pdst
   Details     : SH and SW convert 'val' to uint16_t / uint32_t and write it
                 to the address 'pdst' with the 'ush' / 'usw' assembler
                 instructions.
                 SD is composed of two SW stores: the low 32 bits of 'val'
                 go to (pdst) and the high 32 bits to (pdst + 4).
   Note        : All temporaries carry a macro-specific prefix (sh_, sw_,
                 sd_) so they cannot shadow caller arguments.
                 SD first captures 'val' in a uint64_t, so the argument is
                 evaluated exactly once and the shift by 32 is always done
                 on a 64-bit operand.
*/
#define SH(val, pdst) { \
  uint8_t *sh_pdst_m = (uint8_t *)(pdst); \
  const uint16_t sh_val_m = (val); \
  \
  __asm__ __volatile__ ( \
      "ush  %[sh_val_m],  %[sh_pdst_m]  \n\t" \
      \
      : [sh_pdst_m] "=m" (*sh_pdst_m) \
      : [sh_val_m] "r" (sh_val_m) \
  ); \
}

#define SW(val, pdst) { \
  uint8_t *sw_pdst_m = (uint8_t *)(pdst); \
  const uint32_t sw_val_m = (val); \
  \
  __asm__ __volatile__ ( \
      "usw  %[sw_val_m],  %[sw_pdst_m]  \n\t" \
      \
      : [sw_pdst_m] "=m" (*sw_pdst_m) \
      : [sw_val_m] "r" (sw_val_m) \
  ); \
}

#define SD(val, pdst) { \
  uint8_t *sd_pdst_m = (uint8_t *)(pdst); \
  const uint64_t sd_val_m = (val); \
  const uint32_t sd_val0_m = (uint32_t)(sd_val_m & 0x00000000FFFFFFFF); \
  const uint32_t sd_val1_m = \
      (uint32_t)((sd_val_m >> 32) & 0x00000000FFFFFFFF); \
  \
  SW(sd_val0_m, sd_pdst_m); \
  SW(sd_val1_m, sd_pdst_m + 4); \
}
229  #endif  // (__mips_isa_rev >= 6)
230  
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1, out2, out3
   Details     : Load word in 'out0' from (psrc)
                 Load word in 'out1' from (psrc + stride)
                 Load word in 'out2' from (psrc + 2 * stride)
                 Load word in 'out3' from (psrc + 3 * stride)
                 'stride' is parenthesized in the expansion, so an
                 expression argument such as (a + b) is scaled as a whole.
*/
#define LW4(psrc, stride, out0, out1, out2, out3) { \
  out0 = LW((psrc)); \
  out1 = LW((psrc) + (stride)); \
  out2 = LW((psrc) + 2 * (stride)); \
  out3 = LW((psrc) + 3 * (stride)); \
}
245  
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD2); out0, out1, out2, out3 (LD4)
   Details     : Load double word in 'out0' from (psrc)
                 Load double word in 'out1' from (psrc + stride)
                 LD4 continues with 'out2' from (psrc + 2 * stride) and
                 'out3' from (psrc + 3 * stride)
                 'stride' is parenthesized in the expansion, so an
                 expression argument such as (a + b) is scaled as a whole.
*/
#define LD2(psrc, stride, out0, out1) { \
  out0 = LD((psrc)); \
  out1 = LD((psrc) + (stride)); \
}
#define LD4(psrc, stride, out0, out1, out2, out3) { \
  LD2((psrc), (stride), out0, out1); \
  LD2((psrc) + 2 * (stride), (stride), out2, out3); \
}
260  
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store word from 'in0' to (pdst)
                 Store word from 'in1' to (pdst + stride)
                 Store word from 'in2' to (pdst + 2 * stride)
                 Store word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in the expansion, so an
                 expression argument such as (a + b) is scaled as a whole.
*/
#define SW4(in0, in1, in2, in3, pdst, stride) { \
  SW(in0, (pdst)); \
  SW(in1, (pdst) + (stride)); \
  SW(in2, (pdst) + 2 * (stride)); \
  SW(in3, (pdst) + 3 * (stride)); \
}
274  
/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store double word from 'in0' to (pdst)
                 Store double word from 'in1' to (pdst + stride)
                 Store double word from 'in2' to (pdst + 2 * stride)
                 Store double word from 'in3' to (pdst + 3 * stride)
                 'stride' is parenthesized in the expansion, so an
                 expression argument such as (a + b) is scaled as a whole.
*/
#define SD4(in0, in1, in2, in3, pdst, stride) { \
  SD(in0, (pdst)); \
  SD(in1, (pdst) + (stride)); \
  SD(in2, (pdst) + 2 * (stride)); \
  SD(in3, (pdst) + 3 * (stride)); \
}
288  
/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD_B2) ... out0 - out7 (LD_B8)
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
                 The wider variants continue the same pattern: 'outK' is
                 loaded from (psrc + K * stride).
                 'stride' is parenthesized in every expansion, so an
                 expression argument such as (a + b) is scaled as a whole.
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1) { \
  out0 = LD_B(RTYPE, (psrc)); \
  out1 = LD_B(RTYPE, (psrc) + (stride)); \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) { \
  LD_B2(RTYPE, (psrc), (stride), out0, out1); \
  out2 = LD_B(RTYPE, (psrc) + 2 * (stride)); \
}
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
  LD_B2(RTYPE, (psrc), (stride), out0, out1); \
  LD_B2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) { \
  LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
  out4 = LD_B(RTYPE, (psrc) + 4 * (stride)); \
}
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)

#define LD_B7(RTYPE, psrc, stride, \
              out0, out1, out2, out3, out4, out5, out6) { \
  LD_B5(RTYPE, (psrc), (stride), out0, out1, out2, out3, out4); \
  LD_B2(RTYPE, (psrc) + 5 * (stride), (stride), out5, out6); \
}
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride, \
              out0, out1, out2, out3, out4, out5, out6, out7) { \
  LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
  LD_B4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7); \
}
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
337  
/* Description : Load vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1 (LD_H2) ... out0 - out15 (LD_H16)
                 Return Type - as per RTYPE
   Details     : Load 8 halfword elements in 'out0' from (psrc)
                 Load 8 halfword elements in 'out1' from (psrc + stride)
                 The wider variants continue the same pattern: 'outK' is
                 loaded from (psrc + K * stride).
                 'stride' is parenthesized in every expansion, so an
                 expression argument such as (a + b) is scaled as a whole.
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1) { \
  out0 = LD_H(RTYPE, (psrc)); \
  out1 = LD_H(RTYPE, (psrc) + (stride)); \
}
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
  LD_H2(RTYPE, (psrc), (stride), out0, out1); \
  LD_H2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3); \
}
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)

#define LD_H8(RTYPE, psrc, stride, \
              out0, out1, out2, out3, out4, out5, out6, out7) { \
  LD_H4(RTYPE, (psrc), (stride), out0, out1, out2, out3); \
  LD_H4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7); \
}
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)

#define LD_H16(RTYPE, psrc, stride, \
               out0, out1, out2, out3, out4, out5, out6, out7, \
               out8, out9, out10, out11, out12, out13, out14, out15) { \
  LD_H8(RTYPE, (psrc), (stride), \
        out0, out1, out2, out3, out4, out5, out6, out7); \
  LD_H8(RTYPE, (psrc) + 8 * (stride), (stride), \
        out8, out9, out10, out11, out12, out13, out14, out15); \
}
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
372  
/* Description : Load 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (Each vector with 4 signed halfwords)
   Arguments   : Input   - psrc
                 Outputs - out0, out1, out2, out3
   Details     : 'out0' is loaded from (psrc) and 'out2' from (psrc + 8),
                 the offset being counted in elements of psrc's pointee
                 type. 'out1' and 'out3' are produced from 'out0' and
                 'out2' respectively with __msa_ilvl_d, using the same
                 vector for both operands.
*/
#define LD4x4_SH(psrc, out0, out1, out2, out3) { \
  out0 = LD_SH((psrc)); \
  out2 = LD_SH((psrc) + 8); \
  out1 = (v8i16)__msa_ilvl_d((v2i64)(out0), (v2i64)(out0)); \
  out3 = (v8i16)__msa_ilvl_d((v2i64)(out2), (v2i64)(out2)); \
}
384  
/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : Load 'out0' from (psrc) and 'out1' from (psrc + stride).
                 'stride' is parenthesized in the expansion so that an
                 expression argument is added as a whole.
*/
#define LD_SW2(psrc, stride, out0, out1) { \
  out0 = LD_SW((psrc)); \
  out1 = LD_SW((psrc) + (stride)); \
}
394  
/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride (ST_B2)
                          in0 - in7, pdst, stride (up to ST_B8)
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
                 The wider variants continue the same pattern: 'inK' is
                 stored to (pdst + K * stride).
                 'stride' is parenthesized in every expansion, so an
                 expression argument such as (a + b) is scaled as a whole.
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride) { \
  ST_B(RTYPE, in0, (pdst)); \
  ST_B(RTYPE, in1, (pdst) + (stride)); \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
  ST_B2(RTYPE, in0, in1, (pdst), (stride)); \
  ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
              pdst, stride) { \
  ST_B4(RTYPE, in0, in1, in2, in3, (pdst), (stride)); \
  ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride)); \
}
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
418  
/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride (ST_H2)
                          in0 - in7, pdst, stride (up to ST_H8)
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
                 The wider variants continue the same pattern: 'inK' is
                 stored to (pdst + K * stride).
                 'stride' is parenthesized in every expansion, so an
                 expression argument such as (a + b) is scaled as a whole.
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride) { \
  ST_H(RTYPE, in0, (pdst)); \
  ST_H(RTYPE, in1, (pdst) + (stride)); \
}
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) { \
  ST_H2(RTYPE, in0, in1, (pdst), (stride)); \
  ST_H2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride)); \
}
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)

#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) { \
  ST_H4(RTYPE, in0, in1, in2, in3, (pdst), (stride)); \
  ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride)); \
}
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
441  
/* Description : Store vectors of word elements with stride
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Store 4 word elements from 'in0' to (pdst)
                 Store 4 word elements from 'in1' to (pdst + stride)
                 'stride' is parenthesized in the expansion so that an
                 expression argument is added as a whole.
*/
#define ST_SW2(in0, in1, pdst, stride) { \
  ST_SW(in0, (pdst)); \
  ST_SW(in1, (pdst) + (stride)); \
}
451  
/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index 'stidx' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst)
                 Index 'stidx+1' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + stride)
                 Index 'stidx+2' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 2 * stride)
                 Index 'stidx+3' halfword element from 'in' vector is copied to
                 the GP register and stored to (pdst + 3 * stride)
                 'in', 'stidx' and 'stride' are parenthesized in the
                 expansion so that expression arguments are used as a whole.
*/
#define ST2x4_UB(in, stidx, pdst, stride) { \
  uint16_t out0_m, out1_m, out2_m, out3_m; \
  uint8_t *pblk_2x4_m = (uint8_t *)(pdst); \
  \
  out0_m = __msa_copy_u_h((v8i16)(in), (stidx)); \
  out1_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 1)); \
  out2_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 2)); \
  out3_m = __msa_copy_u_h((v8i16)(in), ((stidx) + 3)); \
  \
  SH(out0_m, pblk_2x4_m); \
  SH(out1_m, pblk_2x4_m + (stride)); \
  SH(out2_m, pblk_2x4_m + 2 * (stride)); \
  SH(out3_m, pblk_2x4_m + 3 * (stride)); \
}
477  
/* Description : Store 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 word element from 'in' vector is copied to the GP
                 register and stored to (pdst)
                 Index 1 word element from 'in' vector is copied to the GP
                 register and stored to (pdst + stride)
                 'in' and 'stride' are parenthesized in the expansion so
                 that expression arguments are used as a whole.
*/
#define ST4x2_UB(in, pdst, stride) { \
  uint32_t out0_m, out1_m; \
  uint8_t *pblk_4x2_m = (uint8_t *)(pdst); \
  \
  out0_m = __msa_copy_u_w((v4i32)(in), 0); \
  out1_m = __msa_copy_u_w((v4i32)(in), 1); \
  \
  SW(out0_m, pblk_4x2_m); \
  SW(out1_m, pblk_4x2_m + (stride)); \
}
495  
/* Description : Store 4x4 byte block to destination memory from input vectors
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to the
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 ST4x8_UB stores word elements 0..3 of 'in0' to rows 0..3
                 and word elements 0..3 of 'in1' to rows 4..7 of 'pdst'.
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \
  uint32_t out0_m, out1_m, out2_m, out3_m; \
  uint8_t *pblk_4x4_m = (uint8_t *)(pdst); \
  \
  out0_m = __msa_copy_u_w((v4i32)(in0), (idx0)); \
  out1_m = __msa_copy_u_w((v4i32)(in0), (idx1)); \
  out2_m = __msa_copy_u_w((v4i32)(in1), (idx2)); \
  out3_m = __msa_copy_u_w((v4i32)(in1), (idx3)); \
  \
  SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride)); \
}
#define ST4x8_UB(in0, in1, pdst, stride) { \
  uint8_t *pblk_4x8 = (uint8_t *)(pdst); \
  \
  ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, (stride)); \
  ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * (stride), (stride)); \
}
524  
/* Description : Store 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 'in' and 'pdst' are parenthesized in the expansion so that
                 expression arguments are used as a whole.
*/
#define ST8x1_UB(in, pdst) { \
  uint64_t out0_m; \
  \
  out0_m = __msa_copy_u_d((v2i64)(in), 0); \
  SD(out0_m, (pdst)); \
}
536  
/* Description : Store 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Index 0 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in' vector is copied to the
                 GP register and stored to (pdst + stride)
                 'in' and 'stride' are parenthesized in the expansion so
                 that expression arguments are used as a whole.
*/
#define ST8x2_UB(in, pdst, stride) { \
  uint64_t out0_m, out1_m; \
  uint8_t *pblk_8x2_m = (uint8_t *)(pdst); \
  \
  out0_m = __msa_copy_u_d((v2i64)(in), 0); \
  out1_m = __msa_copy_u_d((v2i64)(in), 1); \
  \
  SD(out0_m, pblk_8x2_m); \
  SD(out1_m, pblk_8x2_m + (stride)); \
}
554  
/* Description : Store 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in0' vector is copied to the
                 GP register and stored to (pdst + stride)
                 Index 0 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 2 * stride)
                 Index 1 double word element from 'in1' vector is copied to the
                 GP register and stored to (pdst + 3 * stride)
                 'in0', 'in1' and 'stride' are parenthesized in the
                 expansion so that expression arguments are used as a whole.
*/
#define ST8x4_UB(in0, in1, pdst, stride) { \
  uint64_t out0_m, out1_m, out2_m, out3_m; \
  uint8_t *pblk_8x4_m = (uint8_t *)(pdst); \
  \
  out0_m = __msa_copy_u_d((v2i64)(in0), 0); \
  out1_m = __msa_copy_u_d((v2i64)(in0), 1); \
  out2_m = __msa_copy_u_d((v2i64)(in1), 0); \
  out3_m = __msa_copy_u_d((v2i64)(in1), 1); \
  \
  SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, (stride)); \
}
578  
/* Description : Average with rounding (in0 + in1 + 1) / 2.
   Arguments   : Inputs  - in0, in1, in2, in3 (AVER_UB2)
                           in0 - in7 (AVER_UB4)
                 Outputs - out0, out1 (AVER_UB2); out0 - out3 (AVER_UB4)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from 'in0' vector is added with
                 each unsigned byte element from 'in1' vector. Then the average
                 with rounding is calculated and written to 'out0'.
                 The remaining outputs are produced the same way from the
                 following input pairs: (in2, in3) -> 'out1',
                 (in4, in5) -> 'out2', (in6, in7) -> 'out3'.
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) { \
  out0 = (RTYPE)__msa_aver_u_b((v16u8)(in0), (v16u8)(in1)); \
  out1 = (RTYPE)__msa_aver_u_b((v16u8)(in2), (v16u8)(in3)); \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3) { \
  AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1); \
  AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3); \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
599  
/* Description : Immediate number of elements to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val (SLDI_B2_0)
                           in0, in1, in2, in3, slide_val (SLDI_B4_0)
                 Outputs - out0, out1 (SLDI_B2_0); out0 - out3 (SLDI_B4_0)
                 Return Type - as per RTYPE
   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
                 value specified in the 'slide_val'; the result is written
                 to 'out0'. 'in1' -> 'out1' (and 'in2' -> 'out2',
                 'in3' -> 'out3' for SLDI_B4_0) are handled the same way.
                 'zero_m' is a local all-zero v16i8 vector.
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) { \
  v16i8 zero_m = { 0 }; \
  out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in0), (slide_val)); \
  out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)(in1), (slide_val)); \
}
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

#define SLDI_B4_0(RTYPE, in0, in1, in2, in3, \
                  out0, out1, out2, out3, slide_val) { \
  SLDI_B2_0(RTYPE, in0, in1, out0, out1, (slide_val)); \
  SLDI_B2_0(RTYPE, in2, in3, out2, out3, (slide_val)); \
}
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
620  
/* Description : Immediate number of elements to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val (SLDI_B2)
                           in0_0 - in0_2, in1_0 - in1_2, slide_val (SLDI_B3)
                 Outputs - out0, out1 (SLDI_B2); out0, out1, out2 (SLDI_B3)
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                 value specified in the 'slide_val'; the result is written
                 to 'out0'. The pairs (in0_1, in1_1) -> 'out1' and
                 (in0_2, in1_2) -> 'out2' are handled the same way.
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) { \
  out0 = (RTYPE)__msa_sldi_b((v16i8)(in0_0), (v16i8)(in1_0), (slide_val)); \
  out1 = (RTYPE)__msa_sldi_b((v16i8)(in0_1), (v16i8)(in1_1), (slide_val)); \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)

#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2, \
                out0, out1, out2, slide_val) { \
  SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, (slide_val)); \
  out2 = (RTYPE)__msa_sldi_b((v16i8)(in0_2), (v16i8)(in1_2), (slide_val)); \
}
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
642  
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1 (VSHF_B2)
                           in0, in1, mask0, mask1, mask2, mask3 (VSHF_B4)
                 Outputs - out0, out1 (VSHF_B2); out0 - out3 (VSHF_B4)
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
                 'out0' as per control vector 'mask0'; 'in2' & 'in3' are
                 combined into 'out1' as per 'mask1'.
                 Note the operand order handed to __msa_vshf_b: the mask
                 first, then the second input, then the first input.
                 VSHF_B4 applies four different masks to the same pair
                 'in0' & 'in1'.
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \
  out0 = (RTYPE)__msa_vshf_b((v16i8)(mask0), (v16i8)(in1), (v16i8)(in0)); \
  out1 = (RTYPE)__msa_vshf_b((v16i8)(mask1), (v16i8)(in3), (v16i8)(in2)); \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \
                out0, out1, out2, out3) { \
  VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
  VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
665  
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                           (DOTP_UB4: mult0..mult3, cnst0..cnst3)
                 Outputs - out0, out1 (DOTP_UB4: out0..out3)
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0' producing a result
                 twice the size of input i.e. unsigned halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 Vector arguments are parenthesized before being cast so
                 that an expression argument is cast as a whole.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
  out0 = (RTYPE)__msa_dotp_u_h((v16u8)(mult0), (v16u8)(cnst0));    \
  out1 = (RTYPE)__msa_dotp_u_h((v16u8)(mult1), (v16u8)(cnst1));    \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,         \
                 cnst0, cnst1, cnst2, cnst3,                \
                 out0, out1, out2, out3) {                  \
  DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
  DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
689  
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                           (DOTP_SB4: mult0..mult3, cnst0..cnst3)
                 Outputs - out0, out1 (DOTP_SB4: out0..out3)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 Vector arguments are parenthesized before being cast so
                 that an expression argument is cast as a whole.
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
  out0 = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0));    \
  out1 = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1));    \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                     \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) {  \
  DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);              \
  DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);              \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
712  
/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                           (DOTP_SH4: mult0..mult3, cnst0..cnst3)
                 Outputs - out0, out1 (DOTP_SH4: out0..out3)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 Vector arguments are parenthesized before being cast so
                 that an expression argument is cast as a whole.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
  out0 = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0));    \
  out1 = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1));    \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,         \
                 cnst0, cnst1, cnst2, cnst3,                \
                 out0, out1, out2, out3) {                  \
  DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
  DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
736  
/* Description : Dot product of word vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed word elements from 'mult0' are multiplied with
                 signed word elements from 'cnst0' producing a result
                 twice the size of input i.e. signed double word.
                 The multiplication result of adjacent odd-even elements
                 are added together and written to the 'out0' vector.
                 Vector arguments are parenthesized before being cast so
                 that an expression argument is cast as a whole.
*/
#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
  out0 = (RTYPE)__msa_dotp_s_d((v4i32)(mult0), (v4i32)(cnst0));    \
  out1 = (RTYPE)__msa_dotp_s_d((v4i32)(mult1), (v4i32)(cnst1));    \
}
#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
752  
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                           (DPADD_SB4: mult0..mult3, cnst0..cnst3)
                 Outputs - out0, out1 (DPADD_SB4: out0..out3); each output
                           is also read as the accumulator
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of input i.e. signed halfword.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 Multiplier/constant arguments are parenthesized before
                 being cast so that an expression argument is cast as a
                 whole.
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
  out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0,                        \
                                (v16i8)(mult0), (v16i8)(cnst0));    \
  out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1,                        \
                                (v16i8)(mult1), (v16i8)(cnst1));    \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                     \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) {  \
  DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);              \
  DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);              \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
775  
/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1; each output is also read as the
                           accumulator
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of input i.e. signed word.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 Multiplier/constant arguments are parenthesized before
                 being cast so that an expression argument is cast as a
                 whole.
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
  out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0,                        \
                                (v8i16)(mult0), (v8i16)(cnst0));    \
  out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1,                        \
                                (v8i16)(mult1), (v8i16)(cnst1));    \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
791  
/* Description : Dot product & addition of double word vector elements
   Arguments   : Inputs  - mult0, mult1
                 Outputs - out0, out1; each output is also read as the
                           accumulator
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'mult0' is multiplied with
                 itself producing an intermediate result twice the size of
                 input i.e. signed double word.
                 The multiplication result of adjacent odd-even elements
                 are added to the 'out0' vector.
                 'mult0'/'mult1' appear twice in the expansion, so they
                 should be free of side effects; they are parenthesized
                 before being cast so that an expression argument is cast
                 as a whole.
*/
#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) {              \
  out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0,                      \
                                (v4i32)(mult0), (v4i32)(mult0));  \
  out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1,                      \
                                (v4i32)(mult1), (v4i32)(mult1));  \
}
#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
807  
/* Description : Element-wise unsigned halfword minimum against a common
                 vector, computed in place
   Arguments   : Inputs  - in0, in1 (MIN_UH4: in0, in1, in2, in3), min_vec
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each of 'in0', 'in1', ... is replaced by the result of
                 __msa_min_u_h() applied to itself and 'min_vec'.
                 'min_vec' is passed to the intrinsic without a cast.
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec) {           \
  in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec);    \
  in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec);    \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) {  \
  in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec);     \
  in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec);     \
  in2 = (RTYPE)__msa_min_u_h((v8u16)in2, min_vec);     \
  in3 = (RTYPE)__msa_min_u_h((v8u16)in3, min_vec);     \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
827  
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Input  - in
                 Output - out_m
                 Return Type - signed halfword
   Details     : 'in' is first raised to at least 0 with __msa_maxi_s_h()
                 and then limited to at most 255 with __msa_min_s_h().
                 'in' is parenthesized before being cast so that an
                 expression argument is cast as a whole.
                 CLIP_SH2_0_255 / CLIP_SH4_0_255 apply the clip in place to
                 two / four vectors.
*/
#define CLIP_SH_0_255(in) ({                          \
  v8i16 max_m = __msa_ldi_h(255);                     \
  v8i16 out_m;                                        \
                                                      \
  out_m = __msa_maxi_s_h((v8i16)(in), 0);             \
  out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m);  \
  out_m;                                              \
})
#define CLIP_SH2_0_255(in0, in1) {  \
  in0 = CLIP_SH_0_255(in0);         \
  in1 = CLIP_SH_0_255(in1);         \
}
#define CLIP_SH4_0_255(in0, in1, in2, in3) {  \
  CLIP_SH2_0_255(in0, in1);                   \
  CLIP_SH2_0_255(in2, in3);                   \
}
850  
/* Description : Horizontal addition of 4 signed word elements of input vector
   Arguments   : Input  - in       (signed word vector)
                 Output - sum_m    (i32 sum)
                 Return Type - signed word (GP)
   Details     : 4 signed word elements of 'in' vector are added together and
                 the resulting integer sum is returned.
                 Adjacent words are first added into two double words with
                 __msa_hadd_s_d(), the two double words are added, and word
                 element 0 of that sum is copied out.
                 'in' appears twice in the expansion, so it should be free
                 of side effects; it is parenthesized before being cast so
                 that an expression argument is cast as a whole.
*/
#define HADD_SW_S32(in) ({                            \
  v2i64 res0_m, res1_m;                               \
  int32_t sum_m;                                      \
                                                      \
  res0_m = __msa_hadd_s_d((v4i32)(in), (v4i32)(in));  \
  res1_m = __msa_splati_d(res0_m, 1);                 \
  res0_m = res0_m + res1_m;                           \
  sum_m = __msa_copy_s_w((v4i32)res0_m, 0);           \
  sum_m;                                              \
})
868  
/* Description : Horizontal addition of 8 unsigned halfword elements
   Arguments   : Inputs  - in       (unsigned halfword vector)
                 Outputs - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : 8 unsigned halfword elements of input vector are added
                 together and the resulting integer sum is returned.
                 The reduction goes halfword -> word (__msa_hadd_u_w),
                 word -> double word (__msa_hadd_u_d), then the two double
                 words are added and word element 0 is copied out.
                 'in' appears twice in the expansion, so it should be free
                 of side effects; it is parenthesized before being cast so
                 that an expression argument is cast as a whole.
*/
#define HADD_UH_U32(in) ({                           \
  v4u32 res_m;                                       \
  v2u64 res0_m, res1_m;                              \
  uint32_t sum_m;                                    \
                                                     \
  res_m = __msa_hadd_u_w((v8u16)(in), (v8u16)(in));  \
  res0_m = __msa_hadd_u_d(res_m, res_m);             \
  res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);  \
  res0_m = res0_m + res1_m;                          \
  sum_m = __msa_copy_u_w((v4i32)res0_m, 0);          \
  sum_m;                                             \
})
888  
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1 (HADD_UB4: in0, in1, in2, in3)
                 Outputs - out0, out1 (HADD_UB4: out0, out1, out2, out3)
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to
                 even unsigned byte element from 'in0' (pairwise) and the
                 halfword result is written to 'out0'.
                 Each input appears twice in the expansion, so it should be
                 free of side effects; inputs are parenthesized before being
                 cast so that an expression argument is cast as a whole.
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1) {              \
  out0 = (RTYPE)__msa_hadd_u_h((v16u8)(in0), (v16u8)(in0));  \
  out1 = (RTYPE)__msa_hadd_u_h((v16u8)(in1), (v16u8)(in1));  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) {  \
  HADD_UB2(RTYPE, in0, in1, out0, out1);                               \
  HADD_UB2(RTYPE, in2, in3, out2, out3);                               \
}
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
908  
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned even byte element from 'in0' is subtracted
                 from the adjacent unsigned odd byte element from 'in0'
                 (pairwise, i.e. odd - even) and the halfword result is
                 written to 'out0'.
                 Each input appears twice in the expansion, so it should be
                 free of side effects; inputs are parenthesized before being
                 cast so that an expression argument is cast as a whole.
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) {              \
  out0 = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0));  \
  out1 = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1));  \
}
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
922  
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1
                 Outputs - sad_m                 (halfword vector)
                 Return Type - unsigned halfword
   Details     : Absolute difference of all the byte elements from 'in0' with
                 'ref0' is calculated and preserved in 'diff0'. Then even-odd
                 pairs are added together to generate 8 halfword results.
                 The same is done for 'in1'/'ref1' and both sets of halfword
                 results are accumulated into the returned vector.
                 Inputs are parenthesized before being cast so that an
                 expression argument is cast as a whole.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1) ({                 \
  v16u8 diff0_m, diff1_m;                                   \
  v8u16 sad_m = { 0 };                                      \
                                                            \
  diff0_m = __msa_asub_u_b((v16u8)(in0), (v16u8)(ref0));    \
  diff1_m = __msa_asub_u_b((v16u8)(in1), (v16u8)(ref1));    \
                                                            \
  sad_m += __msa_hadd_u_h((v16u8)diff0_m, (v16u8)diff0_m);  \
  sad_m += __msa_hadd_u_h((v16u8)diff1_m, (v16u8)diff1_m);  \
                                                            \
  sad_m;                                                    \
})
943  
/* Description : Horizontal subtraction of signed halfword vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed even halfword element from 'in0' is subtracted
                 from the adjacent signed odd halfword element from 'in0'
                 (pairwise, i.e. odd - even) and the word result is written
                 to 'out0'.
                 Despite the 'UH' in the name, the inputs are treated as
                 signed halfwords (__msa_hsub_s_w).
                 Each input appears twice in the expansion, so it should be
                 free of side effects; inputs are parenthesized before being
                 cast so that an expression argument is cast as a whole.
*/
#define HSUB_UH2(RTYPE, in0, in1, out0, out1) {              \
  out0 = (RTYPE)__msa_hsub_s_w((v8i16)(in0), (v8i16)(in0));  \
  out1 = (RTYPE)__msa_hsub_s_w((v8i16)(in1), (v8i16)(in1));  \
}
#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
957  
/* Description : Set consecutive elements of a vector from GPR values
   Arguments   : Inputs - in0, in1 (INSERT_W4: in0, in1, in2, in3)
                 Output - out (also read: elements that are not written
                          keep their previous contents)
                 Return Type - as per RTYPE
   Details     : INSERT_W2 writes 'in0' and 'in1' to word elements 0 and 1
                 of 'out'; INSERT_W4 writes 'in0'..'in3' to word elements
                 0..3; INSERT_D2 writes 'in0' and 'in1' to double word
                 elements 0 and 1.
*/
#define INSERT_W2(RTYPE, in0, in1, out) {             \
  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);    \
  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);    \
}
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) {   \
  INSERT_W2(RTYPE, in0, in1, out);                    \
  out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2);    \
  out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3);    \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)

#define INSERT_D2(RTYPE, in0, in1, out) {             \
  out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);    \
  out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);    \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
985  
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; likewise 'in2' and 'in3' into
                 'out1'.
                 Inputs are parenthesized before being cast so that an
                 expression argument is cast as a whole.
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0));  \
  out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2));  \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
999  
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; likewise 'in2' and 'in3' into
                 'out1'.
                 Inputs are parenthesized before being cast so that an
                 expression argument is cast as a whole.
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0));  \
  out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2));  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1014  
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; likewise 'in2' and 'in3' into
                 'out1'.
                 Inputs are parenthesized before being cast so that an
                 expression argument is cast as a whole.
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0));  \
  out1 = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2));  \
}
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
1027  
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; likewise 'in2' and 'in3' into
                 'out1'.
                 Inputs are parenthesized before being cast so that an
                 expression argument is cast as a whole.
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0));  \
  out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2));  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
1040  
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (ILVL_B4: in0..in7)
                 Outputs - out0, out1 (ILVL_B4: out0..out3)
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; each following input pair feeds the
                 next output.
                 Inputs are parenthesized before being cast so that an
                 expression argument is cast as a whole.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1));  \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3));  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
  ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
1064  
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; likewise 'in2' and
                 'in3' into 'out1'.
                 Inputs are parenthesized before being cast so that an
                 expression argument is cast as a whole.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1));  \
  out1 = (RTYPE)__msa_ilvl_h((v8i16)(in2), (v8i16)(in3));  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
1077  
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'; likewise 'in2' and 'in3' into
                 'out1'.
                 Inputs are parenthesized before being cast so that an
                 expression argument is cast as a whole.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1));  \
  out1 = (RTYPE)__msa_ilvl_w((v4i32)(in2), (v4i32)(in3));  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1091  
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                           (ILVR_B4: in0..in7, ILVR_B8: in0..in15)
                 Outputs - out0, out1
                           (ILVR_B4: out0..out3, ILVR_B8: out0..out7)
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
                 and written to out0; each following input pair feeds the
                 next output.
                 Inputs are parenthesized before being cast so that an
                 expression argument is cast as a whole.
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1));  \
  out1 = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3));  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)

#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
                in8, in9, in10, in11, in12, in13, in14, in15,      \
                out0, out1, out2, out3, out4, out5, out6, out7) {  \
  ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,           \
          out0, out1, out2, out3);                                 \
  ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,     \
          out4, out5, out6, out7);                                 \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
1127  
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (ILVR_H4: in0..in7)
                 Outputs - out0, out1 (ILVR_H4: out0..out3)
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; each following input
                 pair feeds the next output.
                 Inputs are parenthesized before being cast so that an
                 expression argument is cast as a whole.
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1));  \
  out1 = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3));  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1147  
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3 (ILVR_W4: in0..in7)
                 Outputs - out0, out1 (ILVR_W4: out0..out3)
                 Return Type - as per RTYPE
   Details     : out0 = __msa_ilvr_w(in0, in1), out1 = __msa_ilvr_w(in2, in3);
                 ILVR_W4 applies the same to two further input pairs.
                 Inputs are parenthesized before being cast so that an
                 expression argument is cast as a whole.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1));  \
  out1 = (RTYPE)__msa_ilvr_w((v4i32)(in2), (v4i32)(in3));  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
  ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1161  
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                           (ILVR_D3: in0..in5, ILVR_D4: in0..in7)
                 Outputs - out0, out1
                           (ILVR_D3: out0..out2, ILVR_D4: out0..out3)
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'; each following input
                 pair feeds the next output.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) {     \
  out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));    \
  out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));    \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5,         \
                out0, out1, out2) {                          \
  out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));    \
  out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));    \
  out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));    \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1190  
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'; left half of byte
                 elements from 'in0' and 'in1' are interleaved and written
                 to 'out1'.
                 Each input appears twice in the expansion, so it should be
                 free of side effects; inputs are parenthesized before being
                 cast so that an expression argument is cast as a whole.
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) {            \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1));  \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1));  \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1206  
/* Halfword form of ILVRL: 'out0' receives __msa_ilvr_h(in0, in1) and 'out1'
   receives __msa_ilvl_h(in0, in1).  Each input appears twice in the
   expansion and is parenthesized before being cast so that an expression
   argument is cast as a whole. */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1) {            \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1));  \
  out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1));  \
}
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1213  
/* Word form of ILVRL: 'out0' receives __msa_ilvr_w(in0, in1) and 'out1'
   receives __msa_ilvl_w(in0, in1).  Each input appears twice in the
   expansion and is parenthesized before being cast so that an expression
   argument is cast as a whole. */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1) {            \
  out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1));  \
  out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1));  \
}
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1220  
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1 (SAT_UH4: in0, in1, in2, in3), sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range.
                 The results are written in place
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val) {         \
  in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val);  \
  in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val);  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) {  \
  SAT_UH2(RTYPE, in0, in1, sat_val);                   \
  SAT_UH2(RTYPE, in2, in3, sat_val);                   \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1242  
/* Description : Saturate the halfword element values to the
                 signed range of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1 (SAT_SH4: in0, in1, in2, in3), sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated
                 (__msa_sat_s_h) to the value range generated with
                 (sat_val + 1) bits.
                 The results are written in place
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val) {         \
  in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val);  \
  in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val);  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) {  \
  SAT_SH2(RTYPE, in0, in1, sat_val);                   \
  SAT_SH2(RTYPE, in2, in3, sat_val);                   \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1264  
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                           (SPLATI_H4: in, idx0, idx1, idx2, idx3)
                 Outputs - out0, out1 (SPLATI_H4: out0, out1, out2, out3)
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                  elements in 'out0' vector
                  Valid index range for halfword operation is 0-7.
                  'in' appears once per output in the expansion, so it
                  should be free of side effects; it is parenthesized
                  before being cast so that an expression argument is cast
                  as a whole.
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) {  \
  out0 = (RTYPE)__msa_splati_h((v8i16)(in), idx0);      \
  out1 = (RTYPE)__msa_splati_h((v8i16)(in), idx1);      \
}
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3) {           \
  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);       \
  SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);       \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1287  
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the right
                 half of 'out0'. 'out1' is built the same way from 'in2' and
                 'in3'.
                 Inputs are parenthesized before the cast so that expression
                 arguments are cast as a whole.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1));  \
  out1 = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3));  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)

/* Four-output variant: (in0, in1) .. (in6, in7) produce out0 .. out3. */
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3) {                       \
  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1312  
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half of
                 'out0' & even halfword elements of 'in1' are copied to the
                 right half of 'out0'. 'out1' is built the same way from 'in2'
                 and 'in3'.
                 Inputs are parenthesized before the cast so that expression
                 arguments are cast as a whole.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1));  \
  out1 = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3));  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

/* Four-output variant: (in0, in1) .. (in6, in7) produce out0 .. out3. */
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3) {                       \
  PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1334  
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double elements of 'in0' are copied to the left half of
                 'out0' & even double elements of 'in1' are copied to the right
                 half of 'out0'. 'out1' is built the same way from 'in2' and
                 'in3'.
                 Inputs are parenthesized before the cast so that expression
                 arguments are cast as a whole.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
  out0 = (RTYPE)__msa_pckev_d((v2i64)(in0), (v2i64)(in1));  \
  out1 = (RTYPE)__msa_pckev_d((v2i64)(in2), (v2i64)(in3));  \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

/* Four-output variant: (in0, in1) .. (in6, in7) produce out0 .. out3. */
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3) {                       \
  PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1356  
/* Description : XOR every byte element with the immediate 128
   Arguments   : Inputs  - in0, in1 (and further vectors for the 3/4/7 forms)
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each input vector is cast to v16u8, XOR'ed byte-wise with
                 128 via __msa_xori_b, cast to RTYPE and stored back into the
                 same vector.
*/
#define XORI_B2_128(RTYPE, in0, in1) {         \
  in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128);  \
  in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)

/* Three-vector form. */
#define XORI_B3_128(RTYPE, in0, in1, in2) {    \
  in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128);  \
  in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128);  \
  in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

/* Four-vector form. */
#define XORI_B4_128(RTYPE, in0, in1, in2, in3) {  \
  in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128);     \
  in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128);     \
  in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128);     \
  in3 = (RTYPE)__msa_xori_b((v16u8)in3, 128);     \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)

/* Seven-vector form. */
#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) {  \
  in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128);                    \
  in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128);                    \
  in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128);                    \
  in3 = (RTYPE)__msa_xori_b((v16u8)in3, 128);                    \
  in4 = (RTYPE)__msa_xori_b((v16u8)in4, 128);                    \
  in5 = (RTYPE)__msa_xori_b((v16u8)in5, 128);                    \
  in6 = (RTYPE)__msa_xori_b((v16u8)in6, 128);                    \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
1389  
/* Description : Average of signed halfword elements -> (a + b) / 2
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is added to each
                 signed halfword element of 'in1' with full precision resulting
                 in one extra bit in the result. The result is then divided by
                 2 and written to 'out0'. The pairs (in2, in3), (in4, in5) and
                 (in6, in7) produce 'out1', 'out2' and 'out3' the same way.
                 Inputs are parenthesized before the cast so that expression
                 arguments are cast as a whole.
*/
#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) {                       \
  out0 = (RTYPE)__msa_ave_s_h((v8i16)(in0), (v8i16)(in1));      \
  out1 = (RTYPE)__msa_ave_s_h((v8i16)(in2), (v8i16)(in3));      \
  out2 = (RTYPE)__msa_ave_s_h((v8i16)(in4), (v8i16)(in5));      \
  out3 = (RTYPE)__msa_ave_s_h((v8i16)(in6), (v8i16)(in7));      \
}
#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
1407  
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between halfword data type range and written to 'out0'.
                 'out1' is produced the same way from 'in2' and 'in3'.
                 Inputs are parenthesized before the cast so that expression
                 arguments are cast as a whole.
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) {    \
  out0 = (RTYPE)__msa_adds_s_h((v8i16)(in0), (v8i16)(in1));  \
  out1 = (RTYPE)__msa_adds_s_h((v8i16)(in2), (v8i16)(in3));  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

/* Four-output variant: (in0, in1) .. (in6, in7) produce out0 .. out3. */
#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3) {                       \
  ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);               \
}
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1428  
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each of 'in0' .. 'in3' is left shifted by 'shift' and the
                 result is written in-place.
                 'shift' is parenthesized so that an expression argument
                 (e.g. 'a & 3') is used as a whole as the shift amount.
*/
#define SLLI_4V(in0, in1, in2, in3, shift) {  \
  in0 = (in0) << (shift);                     \
  in1 = (in1) << (shift);                     \
  in2 = (in2) << (shift);                     \
  in3 = (in3) << (shift);                     \
}
1442  
/* Description : Shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each of 'in0' .. 'in3' is right shifted by 'shift' with the
                 C '>>' operator and the result is written in-place.
                 'shift' is a GP variable.
                 'shift' is parenthesized so that an expression argument
                 (e.g. 'a | 1') is used as a whole as the shift amount.
*/
#define SRA_4V(in0, in1, in2, in3, shift) {  \
  in0 = (in0) >> (shift);                    \
  in1 = (in1) >> (shift);                    \
  in2 = (in2) >> (shift);                    \
  in3 = (in3) >> (shift);                    \
}
1457  
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically by
                 the number of bits in the corresponding element in the vector
                 'shift'. The last discarded bit is added to shifted value for
                 rounding and the result is written in-place. 'in1' is
                 processed the same way.
                 'shift' is a vector. It is parenthesized before the cast so
                 that an expression argument is cast as a whole.
*/
#define SRAR_W2(RTYPE, in0, in1, shift) {                 \
  in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)(shift));  \
  in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)(shift));  \
}

/* Four-vector variant: in0 .. in3 are shifted in place by 'shift'. */
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) {  \
  SRAR_W2(RTYPE, in0, in1, shift);                   \
  SRAR_W2(RTYPE, in2, in3, shift);                   \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
1478  
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1 (in2, in3 for the 4-vector forms), shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Every element of each input vector is shifted right
                 arithmetically by 'shift' bits. The last discarded bit is
                 added to the shifted value for rounding, and the result
                 replaces the input vector.
                 'shift' is an immediate value.
                 The _H forms operate on halfword elements, the _W forms on
                 word elements.
*/
#define SRARI_H2(RTYPE, in0, in1, shift) {        \
  in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift);  \
  in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift);  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) {  \
  in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift);      \
  in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift);      \
  in2 = (RTYPE)__msa_srari_h((v8i16)in2, shift);      \
  in3 = (RTYPE)__msa_srari_h((v8i16)in3, shift);      \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

#define SRARI_W2(RTYPE, in0, in1, shift) {        \
  in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift);  \
  in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) {  \
  in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift);      \
  in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift);      \
  in2 = (RTYPE)__msa_srari_w((v4i32)in2, shift);      \
  in3 = (RTYPE)__msa_srari_w((v4i32)in3, shift);      \
}
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1513  
/* Description : Logical shift right all elements of vector (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Each halfword element of 'in0' is right shifted by 'shift'
                 and the result is written to 'out0'; 'in1' .. 'in3' produce
                 'out1' .. 'out3' the same way. The inputs are not modified.
                 'shift' is an immediate value.
                 Inputs are parenthesized before the cast so that expression
                 arguments are cast as a whole.
*/
#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) {  \
  out0 = (RTYPE)__msa_srli_h((v8i16)(in0), shift);                           \
  out1 = (RTYPE)__msa_srli_h((v8i16)(in1), shift);                           \
  out2 = (RTYPE)__msa_srli_h((v8i16)(in2), shift);                           \
  out3 = (RTYPE)__msa_srli_h((v8i16)(in3), shift);                           \
}
#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
1528  
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and the result is written to 'out0'; 'in2' * 'in3' is
                 written to 'out1'.
                 Operands are parenthesized so that expression arguments
                 (e.g. 'a + b') are multiplied as a whole.
*/
#define MUL2(in0, in1, in2, in3, out0, out1) {  \
  out0 = (in0) * (in1);                         \
  out1 = (in2) * (in3);                         \
}
/* Four-product variant: (in0, in1) .. (in6, in7) produce out0 .. out3. */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) {                \
  MUL2(in0, in1, in2, in3, out0, out1);               \
  MUL2(in4, in5, in6, in7, out2, out3);               \
}
1544  
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in0' is added to 'in1' and result is written
                 to 'out0'; 'in2' + 'in3' is written to 'out1'.
                 Operands are parenthesized so that expression arguments
                 using lower-precedence operators (e.g. 'a << 1') are added
                 as a whole.
*/
#define ADD2(in0, in1, in2, in3, out0, out1) {  \
  out0 = (in0) + (in1);                         \
  out1 = (in2) + (in3);                         \
}
/* Four-sum variant: (in0, in1) .. (in6, in7) produce out0 .. out3. */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) {                \
  ADD2(in0, in1, in2, in3, out0, out1);               \
  ADD2(in4, in5, in6, in7, out2, out3);               \
}
1560  
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element in 'in1' is subtracted from 'in0' and result is
                 written to 'out0'; 'in2' - 'in3' is written to 'out1'.
                 Operands are parenthesized so that an expression subtrahend
                 (e.g. 'a - b') is subtracted as a whole.
*/
#define SUB2(in0, in1, in2, in3, out0, out1) {  \
  out0 = (in0) - (in1);                         \
  out1 = (in2) - (in3);                         \
}
/* Four-difference variant: (in0, in1) .. (in6, in7) produce out0 .. out3. */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) {                \
  SUB2(in0, in1, in2, in3, out0, out1);               \
  SUB2(in4, in5, in6, in7, out2, out3);               \
}
1578  
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in    (halfword vector)
                 Output - out   (sign extended word vector)
                 Return Type - signed word
   Details     : A per-element "less than zero" mask of input vector 'in' is
                 computed and right-interleaved with 'in' itself to generate
                 4 word elements keeping sign intact.
                 'in' is parenthesized before the cast so that an expression
                 argument is cast as a whole. It is expanded twice.
*/
#define UNPCK_R_SH_SW(in, out) {                           \
  v8i16 sign_m = __msa_clti_s_h((v8i16)(in), 0);           \
                                                           \
  out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in));          \
}
1593  
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in          (unsigned byte vector)
                 Outputs - out0, out1  (halfword vectors)
                 Return Type - signed halfword
   Details     : 'in' is interleaved with an all-zero byte vector through
                 ILVRL_B2_SH.
                 Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1) {    \
  v16i8 zeros_m = { 0 };                 \
                                         \
  ILVRL_B2_SH(zeros_m, in, out0, out1);  \
}
1606  
/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input   - in            (halfword vector)
                 Outputs - out0, out1   (sign extended word vectors)
                 Return Type - signed word
   Details     : A per-element "less than zero" mask of input vector 'in' is
                 computed and interleaved right with 'in' itself to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with 'in' to
                 generate 4 signed word elements in 'out1'
                 'in' is parenthesized before the cast so that an expression
                 argument is cast as a whole.
*/
#define UNPCK_SH_SW(in, out0, out1) {                \
  v8i16 tmp_m = __msa_clti_s_h((v8i16)(in), 0);      \
                                                     \
  ILVRL_H2_SW(tmp_m, in, out0, out1);                \
}
1624  
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation
                 out0 = in0 + in3, out1 = in1 + in2,
                 out2 = in1 - in2, out3 = in0 - in3
                 Operands are parenthesized so that expression arguments are
                 added/subtracted as a whole.
                 Outputs are assigned in order while inputs are still being
                 read, so an output must not name a variable that a later
                 statement reads as input.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
  out0 = (in0) + (in3);                                            \
  out1 = (in1) + (in2);                                            \
                                                                   \
  out2 = (in1) - (in2);                                            \
  out3 = (in0) - (in3);                                            \
}
1637  
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation
                 out[k]     = in[k] + in[7 - k]   for k = 0 .. 3
                 out[7 - k] = in[k] - in[7 - k]   for k = 0 .. 3
                 Operands are parenthesized so that expression arguments are
                 added/subtracted as a whole.
                 Outputs are assigned in order while inputs are still being
                 read, so an output must not name a variable that a later
                 statement reads as input.
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,            \
                    out0, out1, out2, out3, out4, out5, out6, out7) {  \
  out0 = (in0) + (in7);                                                \
  out1 = (in1) + (in6);                                                \
  out2 = (in2) + (in5);                                                \
  out3 = (in3) + (in4);                                                \
                                                                       \
  out4 = (in3) - (in4);                                                \
  out5 = (in2) - (in5);                                                \
  out6 = (in1) - (in6);                                                \
  out7 = (in0) - (in7);                                                \
}
1655  
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation
                 out[k]      = in[k] + in[15 - k]   for k = 0 .. 7
                 out[15 - k] = in[k] - in[15 - k]   for k = 0 .. 7
                 Operands are parenthesized so that expression arguments are
                 added/subtracted as a whole.
                 Outputs are assigned in order while inputs are still being
                 read, so an output must not name a variable that a later
                 statement reads as input.
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                  \
                     in8, in9,  in10, in11, in12, in13, in14, in15,           \
                     out0, out1, out2, out3, out4, out5, out6, out7,          \
                     out8, out9, out10, out11, out12, out13, out14, out15) {  \
  out0 = (in0) + (in15);                                                      \
  out1 = (in1) + (in14);                                                      \
  out2 = (in2) + (in13);                                                      \
  out3 = (in3) + (in12);                                                      \
  out4 = (in4) + (in11);                                                      \
  out5 = (in5) + (in10);                                                      \
  out6 = (in6) + (in9);                                                       \
  out7 = (in7) + (in8);                                                       \
                                                                              \
  out8 = (in7) - (in8);                                                       \
  out9 = (in6) - (in9);                                                       \
  out10 = (in5) - (in10);                                                     \
  out11 = (in4) - (in11);                                                     \
  out12 = (in3) - (in12);                                                     \
  out13 = (in2) - (in13);                                                     \
  out14 = (in1) - (in14);                                                     \
  out15 = (in0) - (in15);                                                     \
}
1683  
/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Rows are byte-interleaved in two stages, the results are
                 word-interleaved into the even outputs (out0, out2, out4,
                 out6), and each odd output is then derived from the
                 preceding even output with SLDI_B2_0 and a slide value of 8.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
                        out0, out1, out2, out3, out4, out5, out6, out7) {  \
  v16i8 ilvr0_m, ilvr1_m, ilvr2_m, ilvr3_m;                                \
  v16i8 lo_r_m, lo_l_m, hi_r_m, hi_l_m;                                    \
                                                                           \
  ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                       \
             ilvr0_m, ilvr1_m, ilvr2_m, ilvr3_m);                          \
  ILVRL_B2_SB(ilvr1_m, ilvr0_m, lo_r_m, lo_l_m);                           \
  ILVRL_B2_SB(ilvr3_m, ilvr2_m, hi_r_m, hi_l_m);                           \
  ILVRL_W2(RTYPE, hi_r_m, lo_r_m, out0, out2);                             \
  ILVRL_W2(RTYPE, hi_l_m, lo_l_m, out4, out6);                             \
  SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                             \
  SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                             \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
1704  
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : The output vectors are also used as scratch storage: out0 ..
                 out7 are written by the first four interleaves, before all
                 inputs have been read, so an output must not name the same
                 variable as an input.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                            in8, in9, in10, in11, in12, in13, in14, in15,      \
                            out0, out1, out2, out3, out4, out5, out6, out7) {  \
  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                        \
  v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                        \
                                                                               \
  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                                 \
  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                               \
  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                               \
  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                               \
                                                                               \
  tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                     \
  tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                     \
  tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                     \
  tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                     \
  out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                       \
  tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                     \
  out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                       \
  tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                     \
                                                                               \
  ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                     \
  out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
  out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
                                                                               \
  tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);                 \
  tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                     \
  out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
  out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
                                                                               \
  ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);                 \
  out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
  out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
                                                                               \
  tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);                 \
  tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);                 \
  out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
  out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
}
1751  
/* Description : Transpose 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Row pairs (in0, in1) and (in2, in3) are halfword-interleaved,
                 the two results are word-interleaved into 'out0' and 'out2',
                 and 'out1' / 'out3' are then formed from the left double
                 words of 'out0' / 'out2' with __msa_ilvl_d.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \
  v8i16 ilv01_m, ilv23_m;                                                 \
                                                                          \
  ILVR_H2_SH(in1, in0, in3, in2, ilv01_m, ilv23_m);                       \
  ILVRL_W2_SH(ilv23_m, ilv01_m, out0, out2);                              \
  out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);                   \
  out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);                   \
}
1765  
/* Description : Transpose 4x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - signed halfword
   Details     : The right halves of the eight inputs are interleaved (by
                 halfword, then word, then double word) into 'out0' .. 'out3'.
                 'out4' .. 'out7' are set to all-zero vectors.
*/
#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,            \
                           out0, out1, out2, out3, out4, out5, out6, out7) {  \
  v8i16 h0_m, h1_m, h2_m, h3_m;                                               \
  v8i16 wr0_m, wl0_m, wr1_m, wl1_m;                                           \
  v8i16 zero_m = { 0 };                                                       \
                                                                              \
  ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                          \
             h0_m, h1_m, h2_m, h3_m);                                         \
  ILVRL_W2_SH(h1_m, h0_m, wr0_m, wl0_m);                                      \
  ILVRL_W2_SH(h3_m, h2_m, wr1_m, wl1_m);                                      \
                                                                              \
  out0 = (v8i16)__msa_ilvr_d((v2i64)wr1_m, (v2i64)wr0_m);                     \
  out1 = (v8i16)__msa_ilvl_d((v2i64)wr1_m, (v2i64)wr0_m);                     \
  out2 = (v8i16)__msa_ilvr_d((v2i64)wl1_m, (v2i64)wl0_m);                     \
  out3 = (v8i16)__msa_ilvl_d((v2i64)wl1_m, (v2i64)wl0_m);                     \
                                                                              \
  out4 = zero_m;                                                              \
  out5 = zero_m;                                                              \
  out6 = zero_m;                                                              \
  out7 = zero_m;                                                              \
}
1792  
/* Description : Transpose 8x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Row pairs (in0, in1) and (in2, in3) are halfword-interleaved
                 (right and left halves separately); the results are then
                 word-interleaved right into 'out0' / 'out2' and left into
                 'out1' / 'out3'.
*/
#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \
  v8i16 r01_m, r23_m, l01_m, l23_m;                                       \
                                                                          \
  ILVR_H2_SH(in1, in0, in3, in2, r01_m, r23_m);                           \
  ILVL_H2_SH(in1, in0, in3, in2, l01_m, l23_m);                           \
  ILVR_W2_SH(r23_m, r01_m, l23_m, l01_m, out0, out2);                     \
  ILVL_W2_SH(r23_m, r01_m, l23_m, l01_m, out1, out3);                     \
}
1806  
/* Description : Transpose 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Rows in0..in3 and rows in4..in7 are each reduced to four
                 intermediate vectors by two rounds of halfword interleaving.
                 Even outputs pack the even double words of matching
                 intermediates (PCKEV_D4); odd outputs pack the odd double
                 words (__msa_pckod_d).
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
                       out0, out1, out2, out3, out4, out5, out6, out7) {  \
  v8i16 ilv0_m, ilv1_m;                                                   \
  v8i16 top0_m, top1_m, top2_m, top3_m;                                   \
  v8i16 bot0_m, bot1_m, bot2_m, bot3_m;                                   \
                                                                          \
  ILVR_H2_SH(in2, in0, in3, in1, ilv0_m, ilv1_m);                         \
  ILVRL_H2_SH(ilv1_m, ilv0_m, top0_m, top1_m);                            \
  ILVL_H2_SH(in2, in0, in3, in1, ilv0_m, ilv1_m);                         \
  ILVRL_H2_SH(ilv1_m, ilv0_m, top2_m, top3_m);                            \
  ILVR_H2_SH(in6, in4, in7, in5, ilv0_m, ilv1_m);                         \
  ILVRL_H2_SH(ilv1_m, ilv0_m, bot0_m, bot1_m);                            \
  ILVL_H2_SH(in6, in4, in7, in5, ilv0_m, ilv1_m);                         \
  ILVRL_H2_SH(ilv1_m, ilv0_m, bot2_m, bot3_m);                            \
  PCKEV_D4(RTYPE, bot0_m, top0_m, bot1_m, top1_m, bot2_m, top2_m,         \
           bot3_m, top3_m, out0, out2, out4, out6);                       \
  out1 = (RTYPE)__msa_pckod_d((v2i64)bot0_m, (v2i64)top0_m);              \
  out3 = (RTYPE)__msa_pckod_d((v2i64)bot1_m, (v2i64)top1_m);              \
  out5 = (RTYPE)__msa_pckod_d((v2i64)bot2_m, (v2i64)top2_m);              \
  out7 = (RTYPE)__msa_pckod_d((v2i64)bot3_m, (v2i64)top3_m);              \
}
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
1834  
/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : The pairs (in0, in1) and (in2, in3) are word-interleaved
                 with ILVRL_W2_SW into local temporaries; the four outputs
                 are then formed by doubleword interleaves (__msa_ilvr_d /
                 __msa_ilvl_d) of those temporaries.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) {  \
  v4i32 ilvr01_m, ilvl01_m;                                               \
  v4i32 ilvr23_m, ilvl23_m;                                               \
                                                                          \
  ILVRL_W2_SW(in1, in0, ilvr01_m, ilvl01_m);                              \
  ILVRL_W2_SW(in3, in2, ilvr23_m, ilvl23_m);                              \
                                                                          \
  out0 = (v4i32)__msa_ilvr_d((v2i64)ilvr23_m, (v2i64)ilvr01_m);           \
  out1 = (v4i32)__msa_ilvl_d((v2i64)ilvr23_m, (v2i64)ilvr01_m);           \
  out2 = (v4i32)__msa_ilvr_d((v2i64)ilvl23_m, (v2i64)ilvl01_m);           \
  out3 = (v4i32)__msa_ilvl_d((v2i64)ilvl23_m, (v2i64)ilvl01_m);           \
}
1851  
/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : 'in0'/'in1' and 'in2'/'in3' are combined pairwise with
                 ILVR_D2_SH into two halfword vectors. Four 32-bit words
                 are loaded from 'pdst' (LW4, spaced by 'stride'),
                 inserted into byte vectors, widened to halfwords by
                 interleaving with zero, added to the combined inputs,
                 clipped between 0-255, packed back to bytes and stored
                 to 'pdst' as a 4x4 block.
                 'pdst' and 'stride' are expanded more than once, so
                 pass expressions without side effects.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) {     \
  uint32_t src0_m, src1_m, src2_m, src3_m;                      \
  v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
  v16i8 dst0_m = { 0 };                                         \
  v16i8 dst1_m = { 0 };                                         \
  v16i8 zero_m = { 0 };                                         \
                                                                \
  /* explicit ';' so this does not rely on the helper's form */ \
  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);               \
  LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);            \
  INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
  INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
  CLIP_SH2_0_255(res0_m, res1_m);                               \
  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
  ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);           \
}
1874  
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed with
                 128 to shift the range from signed to unsigned byte.
                 The macro is a statement expression whose value is the
                 resulting v16u8 vector. Arguments are parenthesized before
                 being cast, so expression arguments are cast as a whole.
*/
#define PCKEV_XORI128_UB(in0, in1) ({                        \
  v16u8 out_m;                                               \
                                                             \
  out_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in0));  \
  out_m = (v16u8)__msa_xori_b(out_m, 128);                   \
  out_m;                                                     \
})
1890  
/* Description : Converts inputs to unsigned bytes, interleave, average & store
                 as 8x4 unsigned byte block
   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                          pdst, stride
   Details     : 'in0'/'in1' and 'in2'/'in3' are converted pairwise with
                 PCKEV_XORI128_UB. 'dst0'..'dst3' are combined pairwise with
                 ILVR_D2_UB. The converted vectors are averaged with the
                 combined destination vectors (AVER_UB2_UB) and the two
                 results are written through ST8x4_UB to 'pdst' (taken as a
                 byte pointer) using 'stride'.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                      \
                                dst0, dst1, dst2, dst3, pdst, stride) {  \
  uint8_t *pdst_m = (uint8_t *)(pdst);                                   \
  v16u8 cvt01_m, cvt23_m;                                                \
  v16u8 ref01_m, ref23_m;                                                \
                                                                         \
  cvt01_m = PCKEV_XORI128_UB(in0, in1);                                  \
  cvt23_m = PCKEV_XORI128_UB(in2, in3);                                  \
  ILVR_D2_UB(dst1, dst0, dst3, dst2, ref01_m, ref23_m);                  \
  AVER_UB2_UB(cvt01_m, ref01_m, cvt23_m, ref23_m, cvt01_m, cvt23_m);     \
  ST8x4_UB(cvt01_m, cvt23_m, pdst_m, stride);                            \
}
1907  
/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, pdst
   Details     : Even byte elements of 'in0' and 'in1' are packed into one
                 v16i8 vector, which ST_SB then writes through 'pdst' as a
                 whole vector. 'in0' and 'in1' are parenthesized before the
                 cast, so expression arguments are cast as a whole.
*/
#define PCKEV_ST_SB(in0, in1, pdst) {                 \
  v16i8 tmp_m;                                        \
                                                      \
  tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in0));  \
  ST_SB(tmp_m, (pdst));                               \
}
1918  
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs - in0, in1, mask, coeff, shift
                 Return Type - unsigned halfword (v8u16)
   Details     : Bytes of 'in0' and 'in1' are shuffled according to 'mask'
                 (__msa_vshf_b), the unsigned dot product of the shuffled
                 bytes with 'coeff' is taken into halfwords
                 (__msa_dotp_u_h), and the result is shifted right by
                 'shift' with rounding (__msa_srari_h). The macro is a
                 statement expression whose value is the filtered vector.
                 Vector arguments are parenthesized before being cast, so
                 expression arguments are cast as a whole.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({          \
  v16i8 tmp0_m;                                                      \
  v8u16 tmp1_m;                                                      \
                                                                     \
  tmp0_m = __msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0));  \
  tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)(coeff));            \
  tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);               \
                                                                     \
  tmp1_m;                                                            \
})
1932  #endif  /* VPX_DSP_MIPS_MACROS_MSA_H_ */
1933