/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H

#include <stdint.h>
#include <msa.h>
#include <config.h>

/* MSA vector registers are 128 bits wide. */
#define ALIGNMENT           16
/* NOTE(review): this aligns to TWICE the requested value ((align) << 1);
 * presumably deliberate extra headroom -- confirm against upstream intent
 * before changing. */
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))

/* Load one 16-byte MSA vector of element type RTYPE from 'psrc'. */
#define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)  /* unsigned bytes      */
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)  /* signed bytes        */
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)  /* unsigned halfwords  */
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)  /* signed halfwords    */
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)  /* unsigned words      */
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)  /* signed words        */

/* Store one 16-byte MSA vector 'in' of element type RTYPE to 'pdst'.
 * Note: expands to a plain assignment with no trailing ';'. */
#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)

/* Scalar unaligned load/store helpers (LH/LW/LD load 16/32/64 bits,
 * SH/SW/SD store them).  MIPS R6 supports unaligned scalar accesses in
 * hardware, so the R6 path is a plain dereference; the pre-R6 path uses
 * the assembler's unaligned-access instructions (ulh/ush/usw and the
 * lwr/lwl, ldr/ldl pairs) via inline asm. */
#if (__mips_isa_rev >= 6)
    /* R6: plain (possibly unaligned) 16-bit load via statement expression. */
    #define LH(psrc)                              \
    ( {                                           \
        uint16_t val_lh_m = *(uint16_t *)(psrc);  \
        val_lh_m;                                 \
    } )

    /* R6: plain 32-bit load. */
    #define LW(psrc)                              \
    ( {                                           \
        uint32_t val_lw_m = *(uint32_t *)(psrc);  \
        val_lw_m;                                 \
    } )

    #if (__mips == 64)
        /* 64-bit ISA: single 64-bit load. */
        #define LD(psrc)                               \
        ( {                                            \
            uint64_t val_ld_m =  *(uint64_t *)(psrc);  \
            val_ld_m;                                  \
        } )
    #else  // !(__mips == 64)
        /* 32-bit ISA: assemble the 64-bit value from two 32-bit loads;
         * low word comes from (psrc), high word from (psrc + 4). */
        #define LD(psrc)                                                    \
        ( {                                                                 \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
            uint32_t val0_ld_m, val1_ld_m;                                  \
            uint64_t val_ld_m = 0;                                          \
                                                                            \
            val0_ld_m = LW(psrc_ld_m);                                      \
            val1_ld_m = LW(psrc_ld_m + 4);                                  \
                                                                            \
            val_ld_m = (uint64_t) (val1_ld_m);                              \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
                                                                            \
            val_ld_m;                                                       \
        } )
    #endif  // (__mips == 64)

    /* R6 stores.  Note the trailing ';' is part of each definition;
     * some call sites in this file (e.g. SW4/SD4) rely on that, so do
     * not remove it without fixing those users. */
    #define SH(val, pdst)  *(uint16_t *)(pdst) = (val);
    #define SW(val, pdst)  *(uint32_t *)(pdst) = (val);
    #define SD(val, pdst)  *(uint64_t *)(pdst) = (val);

#else  // !(__mips_isa_rev >= 6)
    /* pre-R6: unaligned 16-bit load through the 'ulh' assembler macro. */
    #define LH(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lh_m = (uint8_t *) (psrc);     \
        uint16_t val_lh_m;                           \
                                                     \
        __asm__ volatile (                           \
            "ulh  %[val_lh_m],  %[psrc_lh_m]  \n\t"  \
                                                     \
            : [val_lh_m] "=r" (val_lh_m)             \
            : [psrc_lh_m] "m" (*psrc_lh_m)           \
        );                                           \
                                                     \
        val_lh_m;                                    \
    } )

    /* pre-R6: unaligned 32-bit load via the lwr/lwl pair.
     * NOTE(review): the 0/3 offset pairing assumes the assembler resolves
     * lwr/lwl per the target endianness -- confirm for big-endian builds. */
    #define LW(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lw_m = (uint8_t *) (psrc);     \
        uint32_t val_lw_m;                           \
                                                     \
        __asm__ volatile (                           \
            "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t"  \
            "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t"  \
                                                     \
            : [val_lw_m] "=&r"(val_lw_m)             \
            : [psrc_lw_m] "r"(psrc_lw_m)             \
        );                                           \
                                                     \
        val_lw_m;                                    \
    } )

    #if (__mips == 64)
        /* pre-R6, 64-bit ISA: unaligned 64-bit load via ldr/ldl pair. */
        #define LD(psrc)                                 \
        ( {                                              \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);     \
            uint64_t val_ld_m = 0;                       \
                                                         \
            __asm__ volatile (                           \
                "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t"  \
                "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t"  \
                                                         \
                : [val_ld_m] "=&r" (val_ld_m)            \
                : [psrc_ld_m] "r" (psrc_ld_m)            \
            );                                           \
                                                         \
            val_ld_m;                                    \
        } )
    #else  // !(__mips == 64)
        /* pre-R6, 32-bit ISA: combine two unaligned 32-bit loads
         * (low word at psrc, high word at psrc + 4). */
        #define LD(psrc)                                                    \
        ( {                                                                 \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
            uint32_t val0_ld_m, val1_ld_m;                                  \
            uint64_t val_ld_m = 0;                                          \
                                                                            \
            val0_ld_m = LW(psrc_ld_m);                                      \
            val1_ld_m = LW(psrc_ld_m + 4);                                  \
                                                                            \
            val_ld_m = (uint64_t) (val1_ld_m);                              \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
                                                                            \
            val_ld_m;                                                       \
        } )
    #endif  // (__mips == 64)

    /* pre-R6: unaligned 16-bit store via 'ush'. */
    #define SH(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sh_m = (uint8_t *) (pdst);     \
        uint16_t val_sh_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "ush  %[val_sh_m],  %[pdst_sh_m]  \n\t"  \
                                                     \
            : [pdst_sh_m] "=m" (*pdst_sh_m)          \
            : [val_sh_m] "r" (val_sh_m)              \
        );                                           \
    }

    /* pre-R6: unaligned 32-bit store via 'usw'. */
    #define SW(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sw_m = (uint8_t *) (pdst);     \
        uint32_t val_sw_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "usw  %[val_sw_m],  %[pdst_sw_m]  \n\t"  \
                                                     \
            : [pdst_sw_m] "=m" (*pdst_sw_m)          \
            : [val_sw_m] "r" (val_sw_m)              \
        );                                           \
    }

    /* pre-R6: 64-bit store as two 32-bit stores (low word first). */
    #define SD(val, pdst)                                             \
    {                                                                 \
        uint8_t *pdst_sd_m = (uint8_t *) (pdst);                      \
        uint32_t val0_sd_m, val1_sd_m;                                \
                                                                      \
        val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
        val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
                                                                      \
        SW(val0_sd_m, pdst_sd_m);                                     \
        SW(val1_sd_m, pdst_sd_m + 4);                                 \
    }
#endif // (__mips_isa_rev >= 6)

/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1, out2, out3
   Details     : Loads the words at (psrc), (psrc + stride),
                 (psrc + 2 * stride) and (psrc + 3 * stride) into
                 out0..out3, reusing the LW2 pair-load helper.
*/
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    LW2((psrc), stride, out0, out1);               \
    LW2((psrc) + 2 * stride, stride, out2, out3);  \
}
209 
/* Load 2 words: out0 from (psrc), out1 from (psrc + stride). */
#define LW2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LW((psrc));                 \
    out1 = LW((psrc) + stride);        \
}

/* Description : Load double words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
*/
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + stride);        \
}
/* Load 4 double words from (psrc) + n * stride, n = 0..3. */
#define LD4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LD((psrc));                             \
    out1 = LD((psrc) + stride);                    \
    out2 = LD((psrc) + 2 * stride);                \
    out3 = LD((psrc) + 3 * stride);                \
}

/* Description : Store 4 words with stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
*/
/* Fix: the first SW() invocation lacked its statement semicolon; it only
 * compiled because both SW() variants happen to tolerate that. */
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + stride);                  \
    SW(in2, (pdst) + 2 * stride);              \
    SW(in3, (pdst) + 3 * stride);              \
}

/* Description : Store 4 double words with stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
*/
/* Fix: the first SD() invocation lacked its statement semicolon; it only
 * compiled because both SD() variants happen to tolerate that. */
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + stride);                  \
    SD(in2, (pdst) + 2 * stride);              \
    SD(in3, (pdst) + 3 * stride);              \
}

/* Description : Load vector elements with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Loads elements in 'out0' from (psrc)
                 Loads elements in 'out1' from (psrc + stride)
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_V(RTYPE, (psrc));                 \
    out1 = LD_V(RTYPE, (psrc) + stride);        \
}
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)

/* Load 3 vectors of RTYPE from (psrc) + n * stride, n = 0..2. */
#define LD_V3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    out0 = LD_V(RTYPE, (psrc));                       \
    out1 = LD_V(RTYPE, (psrc) + stride);              \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride);          \
}
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)

/* Load 4 vectors of RTYPE from (psrc) + n * stride, n = 0..3. */
#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    out0 = LD_V(RTYPE, (psrc));                             \
    out1 = LD_V(RTYPE, (psrc) + stride);                    \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride);                \
    out3 = LD_V(RTYPE, (psrc) + 3 * stride);                \
}
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
#define LD_SW4(...) LD_V4(v4i32, __VA_ARGS__)

/* Load 5 vectors of RTYPE from (psrc) + n * stride, n = 0..4. */
#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    out0 = LD_V(RTYPE, (psrc));                                   \
    out1 = LD_V(RTYPE, (psrc) + stride);                          \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride);                      \
    out3 = LD_V(RTYPE, (psrc) + 3 * stride);                      \
    out4 = LD_V(RTYPE, (psrc) + 4 * stride);                      \
}
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)

/* Load 6 vectors of RTYPE from (psrc) + n * stride, n = 0..5. */
#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    out0 = LD_V(RTYPE, (psrc));                                         \
    out1 = LD_V(RTYPE, (psrc) + stride);                                \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride);                            \
    out3 = LD_V(RTYPE, (psrc) + 3 * stride);                            \
    out4 = LD_V(RTYPE, (psrc) + 4 * stride);                            \
    out5 = LD_V(RTYPE, (psrc) + 5 * stride);                            \
}
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)

/* Load 7 vectors of RTYPE from (psrc) + n * stride, n = 0..6. */
#define LD_V7(RTYPE, psrc, stride,                       \
              out0, out1, out2, out3, out4, out5, out6)  \
{                                                        \
    out0 = LD_V(RTYPE, (psrc));                          \
    out1 = LD_V(RTYPE, (psrc) + stride);                 \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride);             \
    out3 = LD_V(RTYPE, (psrc) + 3 * stride);             \
    out4 = LD_V(RTYPE, (psrc) + 4 * stride);             \
    out5 = LD_V(RTYPE, (psrc) + 5 * stride);             \
    out6 = LD_V(RTYPE, (psrc) + 6 * stride);             \
}
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)

/* Load 8 vectors of RTYPE from (psrc) + n * stride, n = 0..7. */
#define LD_V8(RTYPE, psrc, stride,                             \
              out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                              \
    out0 = LD_V(RTYPE, (psrc));                                \
    out1 = LD_V(RTYPE, (psrc) + stride);                       \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride);                   \
    out3 = LD_V(RTYPE, (psrc) + 3 * stride);                   \
    out4 = LD_V(RTYPE, (psrc) + 4 * stride);                   \
    out5 = LD_V(RTYPE, (psrc) + 5 * stride);                   \
    out6 = LD_V(RTYPE, (psrc) + 6 * stride);                   \
    out7 = LD_V(RTYPE, (psrc) + 7 * stride);                   \
}
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
#define LD_SW8(...) LD_V8(v4i32, __VA_ARGS__)

/* Load 16 vectors of RTYPE from (psrc) + n * stride, n = 0..15. */
#define LD_V16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    out0  = LD_V(RTYPE, (psrc));                                      \
    out1  = LD_V(RTYPE, (psrc) + stride);                             \
    out2  = LD_V(RTYPE, (psrc) + 2 * stride);                         \
    out3  = LD_V(RTYPE, (psrc) + 3 * stride);                         \
    out4  = LD_V(RTYPE, (psrc) + 4 * stride);                         \
    out5  = LD_V(RTYPE, (psrc) + 5 * stride);                         \
    out6  = LD_V(RTYPE, (psrc) + 6 * stride);                         \
    out7  = LD_V(RTYPE, (psrc) + 7 * stride);                         \
    out8  = LD_V(RTYPE, (psrc) + 8 * stride);                         \
    out9  = LD_V(RTYPE, (psrc) + 9 * stride);                         \
    out10 = LD_V(RTYPE, (psrc) + 10 * stride);                        \
    out11 = LD_V(RTYPE, (psrc) + 11 * stride);                        \
    out12 = LD_V(RTYPE, (psrc) + 12 * stride);                        \
    out13 = LD_V(RTYPE, (psrc) + 13 * stride);                        \
    out14 = LD_V(RTYPE, (psrc) + 14 * stride);                        \
    out15 = LD_V(RTYPE, (psrc) + 15 * stride);                        \
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)

/* Description : Store vectors with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst    (destination pointer to store to)
   Details     : Stores elements from 'in0' to (pdst)
                 Stores elements from 'in1' to (pdst + stride)
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_V(RTYPE, in0, (pdst));                 \
    ST_V(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)

/* Store 4 vectors of RTYPE to (pdst) + n * stride, n = 0..3. */
#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride)  \
{                                                       \
    ST_V(RTYPE, in0, (pdst));                           \
    ST_V(RTYPE, in1, (pdst) + stride);                  \
    ST_V(RTYPE, in2, (pdst) + 2 * stride);              \
    ST_V(RTYPE, in3, (pdst) + 3 * stride);              \
}
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)

/* Store 6 vectors of RTYPE to (pdst) + n * stride, n = 0..5. */
#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_V(RTYPE, in0, (pdst));                                     \
    ST_V(RTYPE, in1, (pdst) + stride);                            \
    ST_V(RTYPE, in2, (pdst) + 2 * stride);                        \
    ST_V(RTYPE, in3, (pdst) + 3 * stride);                        \
    ST_V(RTYPE, in4, (pdst) + 4 * stride);                        \
    ST_V(RTYPE, in5, (pdst) + 5 * stride);                        \
}
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)

/* Store 8 vectors of RTYPE to (pdst) + n * stride, n = 0..7. */
#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_V(RTYPE, in0, (pdst));                                               \
    ST_V(RTYPE, in1, (pdst) + stride);                                      \
    ST_V(RTYPE, in2, (pdst) + 2 * stride);                                  \
    ST_V(RTYPE, in3, (pdst) + 3 * stride);                                  \
    ST_V(RTYPE, in4, (pdst) + 4 * stride);                                  \
    ST_V(RTYPE, in5, (pdst) + 5 * stride);                                  \
    ST_V(RTYPE, in6, (pdst) + 6 * stride);                                  \
    ST_V(RTYPE, in7, (pdst) + 7 * stride);                                  \
}
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)

/* Description : Store half word elements of vector with stride
 * Arguments   : Inputs  - in   source vector
 *                       - pdst    (destination pointer to store to)
 *                       - stride
 * Details     : Stores half word 'idx0' from 'in' to (pdst)
 *               Stores half word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_H1(in, idx, pdst)                             \
{                                                        \
    uint16_t out0_m;                                     \
    out0_m = __msa_copy_u_h((v8i16) in, idx);            \
    SH(out0_m, (pdst));                                  \
}
/* Store half word elements 'idx0' and 'idx1' of 'in' to (pdst) and
 * (pdst + stride). */
#define ST_H2(in, idx0, idx1, pdst, stride)       \
{                                                 \
    uint16_t tmp_h_m;                             \
                                                  \
    tmp_h_m = __msa_copy_u_h((v8i16) in, idx0);   \
    SH(tmp_h_m, (pdst));                          \
    tmp_h_m = __msa_copy_u_h((v8i16) in, idx1);   \
    SH(tmp_h_m, (pdst) + stride);                 \
}
/* Store half word elements 'idx0'..'idx3' of 'in' to (pdst) + n * stride,
 * n = 0..3. */
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint16_t tmp_h_m;                                    \
                                                         \
    tmp_h_m = __msa_copy_u_h((v8i16) in, idx0);          \
    SH(tmp_h_m, (pdst));                                 \
    tmp_h_m = __msa_copy_u_h((v8i16) in, idx1);          \
    SH(tmp_h_m, (pdst) + stride);                        \
    tmp_h_m = __msa_copy_u_h((v8i16) in, idx2);          \
    SH(tmp_h_m, (pdst) + 2 * stride);                    \
    tmp_h_m = __msa_copy_u_h((v8i16) in, idx3);          \
    SH(tmp_h_m, (pdst) + 3 * stride);                    \
}
/* Store half word elements 'idx0'..'idx7' of 'in' to (pdst) + n * stride,
 * n = 0..7, via two ST_H4 invocations. */
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5,            \
              idx6, idx7, pdst, stride)                          \
{                                                                \
    ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)              \
    ST_H4(in, idx4, idx5, idx6, idx7, (pdst) + 4*stride, stride) \
}

/* Description : Store word elements of vector with stride
 * Arguments   : Inputs  - in   source vector
 *                       - pdst    (destination pointer to store to)
 *                       - stride
 * Details     : Stores word 'idx0' from 'in' to (pdst)
 *               Stores word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_W1(in, idx, pdst)                             \
{                                                        \
    uint32_t out0_m;                                     \
    out0_m = __msa_copy_u_w((v4i32) in, idx);            \
    SW(out0_m, (pdst));                                  \
}
/* Store word elements 'idx0' and 'idx1' of 'in' to (pdst) and
 * (pdst + stride). */
#define ST_W2(in, idx0, idx1, pdst, stride)       \
{                                                 \
    uint32_t tmp_w_m;                             \
                                                  \
    tmp_w_m = __msa_copy_u_w((v4i32) in, idx0);   \
    SW(tmp_w_m, (pdst));                          \
    tmp_w_m = __msa_copy_u_w((v4i32) in, idx1);   \
    SW(tmp_w_m, (pdst) + stride);                 \
}
/* Store word elements 'idx0'..'idx3' of 'in' to (pdst) + n * stride,
 * n = 0..3. */
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint32_t tmp_w_m;                                    \
                                                         \
    tmp_w_m = __msa_copy_u_w((v4i32) in, idx0);          \
    SW(tmp_w_m, (pdst));                                 \
    tmp_w_m = __msa_copy_u_w((v4i32) in, idx1);          \
    SW(tmp_w_m, (pdst) + stride);                        \
    tmp_w_m = __msa_copy_u_w((v4i32) in, idx2);          \
    SW(tmp_w_m, (pdst) + 2 * stride);                    \
    tmp_w_m = __msa_copy_u_w((v4i32) in, idx3);          \
    SW(tmp_w_m, (pdst) + 3 * stride);                    \
}
/* Store word elements 'idx0'..'idx3' of 'in0' and 'idx4'..'idx7' of 'in1'
 * to (pdst) + n * stride, n = 0..7.
 * Fix: 'pdst' is now parenthesized; the unparenthesized "pdst + 4*stride"
 * misbound when a non-trivial expression (e.g. a conditional) was passed
 * for pdst.  This matches the ST_H8 definition. */
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,                   \
              idx4, idx5, idx6, idx7, pdst, stride)               \
{                                                                 \
    ST_W4(in0, idx0, idx1, idx2, idx3, (pdst), stride)            \
    ST_W4(in1, idx4, idx5, idx6, idx7, (pdst) + 4*stride, stride) \
}

/* Description : Store double word elements of vector with stride
 * Arguments   : Inputs  - in   source vector
 *                       - pdst    (destination pointer to store to)
 *                       - stride
 * Details     : Stores double word 'idx0' from 'in' to (pdst)
 *               Stores double word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_D1(in, idx, pdst)                   \
{                                              \
    uint64_t out0_m;                           \
    out0_m = __msa_copy_u_d((v2i64) in, idx);  \
    SD(out0_m, (pdst));                        \
}
/* Store double word elements 'idx0' and 'idx1' of 'in' to (pdst) and
 * (pdst + stride). */
#define ST_D2(in, idx0, idx1, pdst, stride)       \
{                                                 \
    uint64_t tmp_d_m;                             \
                                                  \
    tmp_d_m = __msa_copy_u_d((v2i64) in, idx0);   \
    SD(tmp_d_m, (pdst));                          \
    tmp_d_m = __msa_copy_u_d((v2i64) in, idx1);   \
    SD(tmp_d_m, (pdst) + stride);                 \
}
/* Store double words 'idx0'/'idx1' of 'in0' and 'idx2'/'idx3' of 'in1'
 * to (pdst) + n * stride, n = 0..3. */
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
{                                                             \
    uint64_t tmp_d_m;                                         \
                                                              \
    tmp_d_m = __msa_copy_u_d((v2i64) in0, idx0);              \
    SD(tmp_d_m, (pdst));                                      \
    tmp_d_m = __msa_copy_u_d((v2i64) in0, idx1);              \
    SD(tmp_d_m, (pdst) + stride);                             \
    tmp_d_m = __msa_copy_u_d((v2i64) in1, idx2);              \
    SD(tmp_d_m, (pdst) + 2 * stride);                         \
    tmp_d_m = __msa_copy_u_d((v2i64) in1, idx3);              \
    SD(tmp_d_m, (pdst) + 3 * stride);                         \
}
/* Store double words 'idx0'..'idx3' of in0/in1 and 'idx4'..'idx7' of
 * in2/in3 to (pdst) + n * stride, n = 0..7.
 * Fix: 'pdst' is now parenthesized; the unparenthesized
 * "pdst + 4 * stride" misbound when a non-trivial expression (e.g. a
 * conditional) was passed for pdst.  This matches the ST_H8 definition. */
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,                \
              idx4, idx5, idx6, idx7, pdst, stride)                      \
{                                                                        \
    ST_D4(in0, in1, idx0, idx1, idx2, idx3, (pdst), stride)              \
    ST_D4(in2, in3, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride) \
}

/* Description : Store as 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : For each row, the low double word (element 0) of the input
                 vector is stored at (pblk_12x8_m) and word element 2 of the
                 same vector at (pblk_12x8_m + 8), giving 12 bytes per row;
                 the row pointer then advances by 'stride'.
                 All 16 scalar copies are extracted before any store, then
                 the stores walk the 8 rows in order. */
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
                                                                         \
    out8_m =  __msa_copy_u_w((v4i32) in0, 2);                            \
    out9_m =  __msa_copy_u_w((v4i32) in1, 2);                            \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}

/* Description : average with rounding (in0 + in1 + 1) / 2.
   Arguments   : Inputs  - in0, in1, in2, in3,
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each byte element from 'in0' is added to the matching byte
                 element from 'in1' plus 1 (for rounding); the sum is kept
                 at full precision and shifted right by one bit (unsigned)
                 before being written to 'out0'.
                 Similar for the pair of 'in2' and 'in3'.
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1);  \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3);  \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

/* Rounded unsigned byte average of four input pairs:
 * outN = aver(in(2N), in(2N+1)). */
#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1);     \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3);     \
    out2 = (RTYPE) __msa_aver_u_b((v16u8) in4, (v16u8) in5);     \
    out3 = (RTYPE) __msa_aver_u_b((v16u8) in6, (v16u8) in7);     \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)

/* Description : Immediate number of columns to slide
   Arguments   : Inputs  - d, s, slide_val
                 Outputs - out
                 Return Type - as per RTYPE
   Details     : Byte elements from vector 'd' are slid into vector 's' by
                 the number of byte positions specified by 'slide_val'.
                 'slide_val' must be a compile-time immediate (0..15), as
                 required by the sldi.b instruction encoding.
*/
#define SLDI_B(RTYPE, d, s, slide_val, out)                       \
{                                                                 \
    out = (RTYPE) __msa_sldi_b((v16i8) d, (v16i8) s, slide_val);  \
}

/* Multi-vector variants: SLDI_B applied to each (d, s) pair independently,
   all with the same 'slide_val' */
#define SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1)  \
{                                                              \
    SLDI_B(RTYPE, d0, s0, slide_val, out0)                     \
    SLDI_B(RTYPE, d1, s1, slide_val, out1)                     \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
#define SLDI_B2_SW(...) SLDI_B2(v4i32, __VA_ARGS__)

#define SLDI_B3(RTYPE, d0, s0, d1, s1, d2, s2, slide_val,  \
                out0, out1, out2)                          \
{                                                          \
    SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1)  \
    SLDI_B(RTYPE, d2, s2, slide_val, out2)                 \
}
#define SLDI_B3_UB(...) SLDI_B3(v16u8, __VA_ARGS__)
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)

#define SLDI_B4(RTYPE, d0, s0, d1, s1, d2, s2, d3, s3,     \
                slide_val, out0, out1, out2, out3)         \
{                                                          \
    SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1)  \
    SLDI_B2(RTYPE, d2, s2, d3, s3, slide_val, out2, out3)  \
}
#define SLDI_B4_UB(...) SLDI_B4(v16u8, __VA_ARGS__)
#define SLDI_B4_SB(...) SLDI_B4(v16i8, __VA_ARGS__)
#define SLDI_B4_SH(...) SLDI_B4(v8i16, __VA_ARGS__)
646 
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective byte elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective byte elements from in2 & in3 are copied to out1 as
                 per control vector mask1
                 Note the intrinsic's operand order: the pair is passed as
                 (mask, in1, in0) so mask indices select from the in0/in1
                 concatenation.
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

/* Three-output variant: the third shuffle selects from the in4/in5 pair */
#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

/* Four-output variant: all four shuffles select from the same in0/in1 pair */
#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
                out0, out1, out2, out3)                            \
{                                                                  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
682 
/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective halfword elements from in0 & in1 are copied to out0
                 as per control vector mask0
                 Selective halfword elements from in2 & in3 are copied to out1
                 as per control vector mask1
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

/* Three-output variant: the third shuffle selects from the in4/in5 pair */
#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4);  \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)
706 
/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective word elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective word elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)      \
{                                                                         \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
722 
/* Description : Dot product of unsigned byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements of 'mult0' are multiplied with the
                 matching unsigned byte elements of 'cnst0', giving products
                 twice the input width (unsigned halfword). Each adjacent
                 even/odd product pair is then summed, yielding 8 unsigned
                 halfword results written to 'out0'.
                 The 'mult1' / 'cnst1' pair is processed the same way into
                 'out1'.
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1);  \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

/* Four-output variant: the same dot product applied to four operand pairs */
#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,               \
                 cnst0, cnst1, cnst2, cnst3,                      \
                 out0, out1, out2, out3)                          \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1);  \
    out2 = (RTYPE) __msa_dotp_u_h((v16u8) mult2, (v16u8) cnst2);  \
    out3 = (RTYPE) __msa_dotp_u_h((v16u8) mult3, (v16u8) cnst3);  \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
750 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with the
                 matching signed byte elements from 'cnst0', producing
                 products twice the input width (signed halfword). The
                 products of each adjacent odd-even element pair are then
                 added together and stored to the out vector
                 (8 signed halfword results).
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

/* Three-output variant */
#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

/* Four-output variant */
#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                     \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
785 
/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with the
                 matching signed halfword elements from 'cnst0', producing
                 products twice the input width (signed word). The products of
                 each adjacent odd-even element pair are then added together
                 and stored to the out vector (4 signed word results).
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1);  \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

/* Four-output variant */
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
813 
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (read-modify-write accumulators)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with the
                 matching signed byte elements from 'cnst0', producing
                 products twice the input width (signed halfword). The
                 products of each adjacent odd-even element pair are added
                 to the existing contents of the out vector
                 (8 signed halfword results).
                 NOTE: 'out0'/'out1' are accumulators and must be
                 initialized by the caller before use.
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                   \
                                   (v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                   \
                                   (v16i8) mult1, (v16i8) cnst1);  \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

/* Four-accumulator variant */
#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
842 
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (read-modify-write accumulators)
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with the
                 matching unsigned byte elements from 'cnst0', producing
                 products twice the input width (unsigned halfword). The
                 products of each adjacent odd-even element pair are added
                 to the existing contents of the out vector
                 (8 unsigned halfword results).
                 NOTE: 'out0'/'out1' are accumulators and must be
                 initialized by the caller before use.
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0,                   \
                                   (v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1,                   \
                                   (v16u8) mult1, (v16u8) cnst1);  \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)
863 
/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (read-modify-write accumulators)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with the
                 matching signed halfword elements from 'cnst0', producing
                 products twice the input width (signed word). The products of
                 each adjacent odd-even element pair are added to the existing
                 contents of the out vector (4 signed word results).
                 NOTE: 'out0'/'out1' are accumulators and must be
                 initialized by the caller before use.
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                   \
                                   (v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                   \
                                   (v8i16) mult1, (v8i16) cnst1);  \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

/* Four-accumulator variant */
#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
892 
/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element of 'in0' / 'in1' is replaced
                 by the minimum of itself and the matching element of
                 'min_vec'.
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)               \
{                                                       \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec);  \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec);  \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

/* Four-vector variant of MIN_UH2 */
#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)     \
{                                                       \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec);  \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec);  \
    in2 = (RTYPE) __msa_min_u_h((v8u16) in2, min_vec);  \
    in3 = (RTYPE) __msa_min_u_h((v8u16) in3, min_vec);  \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
914 
/* Description : Clips all halfword elements of input vector between min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in    (input vector)
                         - min   (min threshold, per-element vector)
                         - max   (max threshold, per-element vector)
                 Outputs - in    (output vector with clipped elements, in place)
                 Return Type - signed halfword
   Details     : Clamps by taking the element-wise maximum with 'min' first,
                 then the element-wise minimum with 'max'.
*/
#define CLIP_SH(in, min, max)                     \
{                                                 \
    in = __msa_max_s_h((v8i16) min, (v8i16) in);  \
    in = __msa_min_s_h((v8i16) max, (v8i16) in);  \
}
928 
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in    (input vector)
                 Outputs - in    (output vector with clipped elements, in place)
                 Return Type - signed halfwords
   Details     : Negative elements are first clamped to 0 (maxi), then the
                 unsigned saturate to 8 bits caps each element at 255.
*/
#define CLIP_SH_0_255(in)                       \
{                                               \
    in = __msa_maxi_s_h((v8i16) in, 0);         \
    in = (v8i16) __msa_sat_u_h((v8u16) in, 7);  \
}

/* Multi-vector variants: CLIP_SH_0_255 applied to each vector in place */
#define CLIP_SH2_0_255(in0, in1)  \
{                                 \
    CLIP_SH_0_255(in0);           \
    CLIP_SH_0_255(in1);           \
}

#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
}

#define CLIP_SH8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SH4_0_255(in0, in1, in2, in3);     \
    CLIP_SH4_0_255(in4, in5, in6, in7);     \
}
959 
/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in    (input vector)
                 Outputs - in    (output vector with clipped elements, in place)
                 Return Type - signed word
   Details     : Negative elements are first clamped to 0 (maxi), then the
                 unsigned saturate to 8 bits caps each element at 255.
*/
#define CLIP_SW_0_255(in)                       \
{                                               \
    in = __msa_maxi_s_w((v4i32) in, 0);         \
    in = (v4i32) __msa_sat_u_w((v4u32) in, 7);  \
}

/* Multi-vector variants: CLIP_SW_0_255 applied to each vector in place */
#define CLIP_SW2_0_255(in0, in1)  \
{                                 \
    CLIP_SW_0_255(in0);           \
    CLIP_SW_0_255(in1);           \
}

#define CLIP_SW4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SW2_0_255(in0, in1);               \
    CLIP_SW2_0_255(in2, in3);               \
}

#define CLIP_SW8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SW4_0_255(in0, in1, in2, in3);     \
    CLIP_SW4_0_255(in4, in5, in6, in7);     \
}
990 
/* Description : Addition of 4 signed word elements
                 4 signed word elements of input vector are added together and
                 resulted integer sum is returned
   Arguments   : Inputs  - in       (signed word vector)
                 Outputs - sum_m    (i32 sum)
                 Return Type - signed word
   Details     : Pairwise horizontal add widens the 4 words to 2 doublewords,
                 the upper doubleword is splatted down and added to the lower
                 one, and the low 32 bits of that total are returned.
*/
#define HADD_SW_S32(in)                               \
( {                                                   \
    v2i64 res0_m, res1_m;                             \
    int32_t sum_m;                                    \
                                                      \
    res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in);  \
    res1_m = __msa_splati_d(res0_m, 1);               \
    res0_m += res1_m;                                 \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);        \
    sum_m;                                            \
} )
1009 
/* Description : Addition of 8 unsigned halfword elements
                 8 unsigned halfword elements of input vector are added
                 together and resulted integer sum is returned
   Arguments   : Inputs  - in       (unsigned halfword vector)
                 Outputs - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : Two pairwise horizontal adds widen the 8 halfwords to 2
                 doublewords, the upper doubleword is splatted down and added
                 to the lower one, and the low 32 bits are returned.
*/
#define HADD_UH_U32(in)                                  \
( {                                                      \
    v4u32 res_m;                                         \
    v2u64 res0_m, res1_m;                                \
    uint32_t sum_m;                                      \
                                                         \
    res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in);      \
    res0_m = __msa_hadd_u_d(res_m, res_m);               \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);  \
    res0_m += res1_m;                                    \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);           \
    sum_m;                                               \
} )
1030 
/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd byte element from 'in0' is added to the
                 adjacent even signed byte element from 'in0' (pairwise) and
                 the widened halfword result is stored in 'out0'
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0);  \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1);  \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

/* Four-vector variant */
#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_SB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_SB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)
1053 
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to the
                 adjacent even unsigned byte element from 'in0' (pairwise) and
                 the widened halfword result is stored in 'out0'
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

/* Three-vector variant */
#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
{                                                             \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                    \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

/* Four-vector variant */
#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)
1084 
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Adjacent unsigned byte elements of 'in0' are subtracted
                 pairwise and the widened halfword difference is stored in
                 'out0' (per the MSA hsub semantics the odd-position element
                 is the minuend — NOTE(review): confirm against the ISA
                 manual).
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

/* Four-vector variant */
#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HSUB_UB2(RTYPE, in0, in1, out0, out1);                           \
    HSUB_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
1108 
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1  (unsigned byte src & ref)
                 Outputs - sad_m                 (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : Absolute differences of all byte elements of 'in0' vs 'ref0'
                 and 'in1' vs 'ref1' are computed; the 16 unsigned absolute
                 differences of each vector are then added pairwise
                 (even-odd) and accumulated into 8 halfword partial sums.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
( {                                                             \
    v16u8 diff0_m, diff1_m;                                     \
    v8u16 sad_m = { 0 };                                        \
                                                                \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0);        \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1);        \
                                                                \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m);  \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m);  \
                                                                \
    sad_m;                                                      \
} )
1131 
/* Description : Insert specified word elements from input values to 1
                 destination vector
   Arguments   : Inputs  - in0, in1, in2, in3 (integer word values, not
                           vectors)
                 Outputs - out                (output vector)
                 Return Type - as per RTYPE
   Details     : The scalar word inputs are written into consecutive word
                 lanes of 'out'; lanes not written keep their previous
                 contents.
*/
#define INSERT_W2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)       \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
1157 
/* Description : Insert specified double word elements from input values to 1
                 destination vector
   Arguments   : Inputs  - in0, in1  (integer doubleword values, not vectors)
                 Outputs - out       (output vector)
                 Return Type - as per RTYPE
   Details     : The two scalar doubleword inputs are written into the two
                 doubleword lanes of 'out'.
*/
#define INSERT_D2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0);  \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
1173 
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and even byte
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even byte elements of 'in2' and even byte
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2);  \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
1192 
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and even halfword
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even halfword elements of 'in2' and even halfword
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2);  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1210 
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and even word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even word elements of 'in2' and even word
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2);  \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
1229 
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and even double word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even double word elements of 'in2' and even double word
                 elements of 'in3' are interleaved and copied to 'out1'
                 (intrinsic operands swapped as in ILVEV_B2 above)
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0);  \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2);  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
1247 
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of in0 and left half of byte
                 elements of in1 are interleaved and copied to out0.
                 Left half of byte elements of in2 and left half of byte
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

/* Four-output variant: processes two independent input pairs */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
1277 
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of in0 and left half of halfword
                 elements of in1 are interleaved and copied to out0.
                 Left half of halfword elements of in2 and left half of halfword
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

/* Four-output variant: processes two independent input pairs */
#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)
1303 
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of in0 and left half of word
                 elements of in1 are interleaved and copied to out0.
                 Left half of word elements of in2 and left half of word
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1321 
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of in0 and right half of byte
                 elements of in1 are interleaved and copied to out0.
                 Right half of byte elements of in2 and right half of byte
                 elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

/* Three-output variant: one extra pair on top of ILVR_B2 */
#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5);              \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

/* Four-output variant: processes two independent input pairs */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)

/* Eight-output variant: processes four independent input pairs */
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
                in8, in9, in10, in11, in12, in13, in14, in15,     \
                out0, out1, out2, out3, out4, out5, out6, out7)   \
{                                                                 \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
            out0, out1, out2, out3);                              \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,  \
            out4, out5, out6, out7);                              \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
#define ILVR_B8_SW(...) ILVR_B8(v4i32, __VA_ARGS__)
1376 
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of in0 and right half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Right half of halfword elements of in2 and right half of
                 halfword elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

/* Three-output variant: one extra pair on top of ILVR_H2 */
#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5);              \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

/* Four-output variant: processes two independent input pairs */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
1410 
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of word elements of in0 and right half of word
                 elements of in1 are interleaved and copied to out0.
                 Right half of word elements of in2 and right half of word
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

/* Four-output variant: processes two independent input pairs */
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1428 
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of in0 and right half of
                 double word elements of in1 are interleaved and copied to out0.
                 Right half of double word elements of in2 and right half of
                 double word elements of in3 are interleaved and copied to out1.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) in2, (v2i64) in3);  \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

/* Three-output variant: one extra pair on top of ILVR_D2 */
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) in4, (v2i64) in5);              \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

/* Four-output variant: processes two independent input pairs */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1462 
/* Description : Interleave left half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of double word elements of in0 and left half of
                 double word elements of in1 are interleaved and copied to out0.
                 Left half of double word elements of in2 and left half of
                 double word elements of in3 are interleaved and copied to out1.
*/
#define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) in2, (v2i64) in3);  \
}
#define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
#define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
#define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)
1480 
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
                 NOTE: out0 must not alias in0/in1, since in0/in1 are read
                 again for out1
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
1500 
/* Halfword variant of ILVRL_B2: right-half interleave of 'in0'/'in1' to
   'out0', left-half interleave to 'out1' (out0 must not alias the inputs) */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
}
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1510 
/* Word variant of ILVRL_B2: right-half interleave of 'in0'/'in1' to 'out0',
   left-half interleave to 'out1' (out0 must not alias the inputs) */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1519 
/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Maximum of signed halfword element values from 'in0' and
                 'max_val' are written back to 'in0' (commonly used for
                 clamping at a lower bound, e.g. max_val = 0)
                 NOTE: max_val must be a compile-time constant in [-16, 15]
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)               \
{                                                        \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, max_val);  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, max_val);  \
}
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

/* Four-vector in-place variant */
#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)  \
{                                                     \
    MAXI_SH2(RTYPE, in0, in1, max_val);               \
    MAXI_SH2(RTYPE, in2, in3, max_val);               \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
#define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__)

/* Eight-vector in-place variant */
#define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val)  \
{                                                                         \
    MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val);                         \
    MAXI_SH4(RTYPE, in4, in5, in6, in7, max_val);                         \
}
#define MAXI_SH8_UH(...) MAXI_SH8(v8u16, __VA_ARGS__)
#define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__)
1551 
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val + 1) bit range
                 Results are written in place to the original vectors
                 NOTE: sat_val must be a compile-time constant in [0, 15]
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val);  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)

/* Four-vector in-place variant */
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
#define SAT_UH4_SH(...) SAT_UH4(v8i16, __VA_ARGS__)

/* Eight-vector in-place variant */
#define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val)  \
{                                                                        \
    SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val);                         \
    SAT_UH4(RTYPE, in4, in5, in6, in7, sat_val);                         \
}
#define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__)
#define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__)
1585 
/* Description : Saturate the halfword element values to the max
                 signed value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 signed value representable in (sat_val + 1) bits
                 Results are written in place to the original vectors
                 NOTE: the original comment said "unsigned" — the intrinsic
                 used here (__msa_sat_s_h) performs signed saturation
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

/* Three-vector in-place variant */
#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)          \
{                                                       \
    SAT_SH2(RTYPE, in0, in1, sat_val);                  \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

/* Four-vector in-place variant */
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1616 
/* Description : Saturate the word element values to the max
                 signed value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'in0' is saturated to the
                 signed value representable in (sat_val + 1) bits
                 Results are written in place to the original vectors
                 NOTE: the original comment said "unsigned" — the intrinsic
                 used here (__msa_sat_s_w) performs signed saturation
*/
#define SAT_SW2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val);  \
}
#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)

/* Four-vector in-place variant */
#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SW2(RTYPE, in0, in1, sat_val);               \
    SAT_SW2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
1640 
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                  elements in 'out0' vector
                  Valid index range for halfword operation is 0-7
                  NOTE: indices must be compile-time constants
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
{                                                     \
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);  \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

/* Three-output variant: one extra index on top of SPLATI_H2 */
#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2,        \
                  out0, out1, out2)                   \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2);  \
}
#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)

/* Four-output variant: replicates four indexed elements */
#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1675 
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                  elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to all
                  elements in 'out1' vector
                  Valid index range for word operation is 0-3
                  NOTE: 'stidx' must be a compile-time constant and
                  'stidx + 1' must also be within range
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)            \
{                                                          \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);      \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1));  \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

/* Replicates all four word elements of 'in' to out0..out3 respectively */
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1702 
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of in0 are copied to the left half of
                 out0 & even byte elements of in1 are copied to the right
                 half of out0.
                 Even byte elements of in2 are copied to the left half of
                 out1 & even byte elements of in3 are copied to the right
                 half of out1.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3);  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

/* Three-output variant: one extra pair on top of PCKEV_B2 */
#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);              \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

/* Four-output variant: processes two independent input pairs */
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1742 
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of in0 are copied to the left half of
                 out0 & even halfword elements of in1 are copied to the right
                 half of out0.
                 Even halfword elements of in2 are copied to the left half of
                 out1 & even halfword elements of in3 are copied to the right
                 half of out1.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3);  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

/* Four-output variant: processes two independent input pairs */
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1770 
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of in0 are copied to the left half
                 of out0 & even double word elements of in1 are copied to the
                 right half of out0.
                 Even double word elements of in2 are copied to the left half
                 of out1 & even double word elements of in3 are copied to the
                 right half of out1.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

/* Four-output variant: processes two independent input pairs */
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1798 
/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Odd (upper) double word element of 'in0' and odd double word
                 element of 'in1' are packed and copied to 'out0'
                 Odd (upper) double word element of 'in2' and odd double word
                 element of 'in3' are packed and copied to 'out1'
                 (the original comment described only two inputs; the macro
                 actually takes two input pairs)
*/
#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
#define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
1816 
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in0' vector
                 Each unsigned byte element from input vector 'in1' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in1' vector
                 Similar for other pairs
                 Xor with 128 flips the MSB of each byte, converting between
                 signed and unsigned byte representations.
*/
#define XORI_B2_128(RTYPE, in0, in1)               \
{                                                  \
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

/* 3- to 8-vector variants, composed from the smaller forms */
#define XORI_B3_128(RTYPE, in0, in1, in2)          \
{                                                  \
    XORI_B2_128(RTYPE, in0, in1);                  \
    in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
#define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)
1882 
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between -32768 to +32767 (as per halfword data type)
                 Similar for other pairs
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3);  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

/* 4-pair variant. NOTE(review): the _UH alias only recasts the result to an
   unsigned vector type; the addition itself remains signed-saturating. */
#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1907 
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 the result is written back to 'in0'; same for the remaining
                 vectors.
*/
#define SLLI_2V(in0, in1, shift)  \
{                                 \
    in0 <<= shift;                \
    in1 <<= shift;                \
}
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 <<= shift;                          \
    in1 <<= shift;                          \
    in2 <<= shift;                          \
    in3 <<= shift;                          \
}
1928 
/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of the four input vectors is right shifted by
                 'shift' (a GP variable) and written back in place.
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 >>= shift;                         \
    in1 >>= shift;                         \
    in2 >>= shift;                         \
    in3 >>= shift;                         \
}
1946 
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logical by
                 number of bits respective element holds in vector 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                           \
    in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift);  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)

/* Rounding variant: srlr_h rounds the result using the last bit shifted out */
#define SRLR_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                            \
    in0 = (RTYPE) __msa_srlr_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srlr_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srlr_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srlr_h((v8i16) in3, (v8i16) shift);  \
}
#define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
#define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)

/* 8-vector rounding variant built from two SRLR_H4 invocations */
#define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
{                                                                      \
    SRLR_H4(RTYPE, in0, in1, in2, in3, shift);                         \
    SRLR_H4(RTYPE, in4, in5, in6, in7, shift);                         \
}
#define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
#define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
1983 
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

/* 3- and 4-vector variants; inner invocations are terminated with ';' for
   consistency with SRARI_H4/SRLR_H8 and robustness should the inner macro
   form ever change */
#define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
{                                                            \
    SRAR_H2(RTYPE, in0, in1, shift);                         \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift);               \
    SRAR_H2(RTYPE, in2, in3, shift);               \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
2017 
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift);  \
    in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift);  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

/* 4-vector variant; inner invocations terminated with ';' for consistency
   with SRARI_W4 */
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift);               \
    SRAR_W2(RTYPE, in2, in3, shift);               \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
2042 
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for other pairs
*/
#define SRARI_H2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift);  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)    \
{                                                     \
    SRARI_H2(RTYPE, in0, in1, shift);                 \
    SRARI_H2(RTYPE, in2, in3, shift);                 \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1     (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Similar for other pairs
*/
#define SRARI_W2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
/* NOTE(review): the _SH alias recasts word results into a halfword vector
   type - confirm callers rely on this reinterpretation */
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
2093 
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and result is written to 'out0'
                 Similar for other pairs
                 NOTE(review): outputs should not alias inputs that are still
                 to be read (e.g. out0 aliasing in2/in3) - statements execute
                 in order.
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 * in1;                         \
    out1 = in2 * in3;                         \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}

/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 2 pairs vectors is added and 2 results are
                 produced
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 + in1;                         \
    out1 = in2 + in3;                         \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}
2128 
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 2 pairs vectors is subtracted and 2 results
                 are produced
*/
#define SUB2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 - in1;                         \
    out1 = in2 - in3;                         \
}
/* 4-pair variant, delegating to SUB2 for consistency with ADD4/MUL4 */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    SUB2(in0, in1, in2, in3, out0, out1);                                     \
    SUB2(in4, in5, in6, in7, out2, out3);                                     \
}
2147 
/* Description : Sign extend byte elements from right half of the vector
   Arguments   : Input  - in    (byte vector)
                 Output - out   (sign extended halfword vector)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted and interleaved with same vector 'in' to generate
                 8 halfword elements keeping sign intact
*/
#define UNPCK_R_SB_SH(in, out)                       \
{                                                    \
    v16i8 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_b((v16i8) in, 0);          \
    out = (v8i16) __msa_ilvr_b(sign_m, (v16i8) in);  \
}

/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Inputs  - in    (input halfword vector)
                 Outputs - out   (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with same vector 'in' to generate
                 4 word elements keeping sign intact
*/
#define UNPCK_R_SH_SW(in, out)                       \
{                                                    \
    v8i16 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_h((v8i16) in, 0);          \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in);  \
}

/* Description : Sign extend byte elements from input vector and return
                 halfword results in pair of vectors
   Arguments   : Inputs  - in           (1 input byte vector)
                 Outputs - out0, out1   (sign extended 2 halfword vectors)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in' to
                 generate 8 signed halfword elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 8 signed halfword elements in 'out1'
*/
#define UNPCK_SB_SH(in, out0, out1)                  \
{                                                    \
    v16i8 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_b((v16i8) in, 0);           \
    ILVRL_B2_SH(tmp_m, in, out0, out1);              \
}

/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in           (1 input unsigned byte vector)
                 Outputs - out0, out1   (2 halfword vectors holding
                                         zero-extended values)
                 Return Type - signed halfword vectors (values are
                 zero-extended, so always non-negative)
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH(in, out0, out1)                   \
{                                                     \
    v16i8 zero_m = { 0 };                             \
                                                      \
    ILVRL_B2_SH(zero_m, in, out0, out1);              \
}

/* Description : Sign extend halfword elements from input vector and return
                 result in pair of vectors
   Arguments   : Inputs  - in           (1 input halfword vector)
                 Outputs - out0, out1   (sign extended 2 word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 4 signed word elements in 'out1'
*/
#define UNPCK_SH_SW(in, out0, out1)                  \
{                                                    \
    v8i16 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_h((v8i16) in, 0);           \
    ILVRL_H2_SW(tmp_m, in, out0, out1);              \
}
2231 
/* Description : Swap two variables
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
   Details     : Exchanges the contents of 'in0' and 'in1' via the xor trick,
                 avoiding a temporary.
                 CAUTION: if both arguments name the same object, the xor
                 sequence zeroes it instead of leaving it unchanged.
*/
#define SWAP(in0, in1)  \
{                       \
    in0 ^= in1;         \
    in1 ^= in0;         \
    in0 ^= in1;         \
}
2243 
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation: sums of mirrored pairs first, then
                 differences.
                 NOTE(review): statements execute in order, so outputs are
                 expected to be distinct from inputs still read afterwards.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                \
    out0 = in0 + in3;                                            \
    out1 = in1 + in2;                                            \
                                                                 \
    out2 = in1 - in2;                                            \
    out3 = in0 - in3;                                            \
}

/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation: out[i] = in[i] + in[7-i] for the first
                 half, out[i] = in[7-i] - in[i-...] differences for the second.
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,          \
                    out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                    \
    out0 = in0 + in7;                                                \
    out1 = in1 + in6;                                                \
    out2 = in2 + in5;                                                \
    out3 = in3 + in4;                                                \
                                                                     \
    out4 = in3 - in4;                                                \
    out5 = in2 - in5;                                                \
    out6 = in1 - in6;                                                \
    out7 = in0 - in7;                                                \
}

/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation: sums of mirrored pairs (in0+in15 ...)
                 followed by the corresponding differences.
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                \
                     in8, in9,  in10, in11, in12, in13, in14, in15,         \
                     out0, out1, out2, out3, out4, out5, out6, out7,        \
                     out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                           \
    out0 = in0 + in15;                                                      \
    out1 = in1 + in14;                                                      \
    out2 = in2 + in13;                                                      \
    out3 = in3 + in12;                                                      \
    out4 = in4 + in11;                                                      \
    out5 = in5 + in10;                                                      \
    out6 = in6 + in9;                                                       \
    out7 = in7 + in8;                                                       \
                                                                            \
    out8 = in7 - in8;                                                       \
    out9 = in6 - in9;                                                       \
    out10 = in5 - in10;                                                     \
    out11 = in4 - in11;                                                     \
    out12 = in3 - in12;                                                     \
    out13 = in2 - in13;                                                     \
    out14 = in1 - in14;                                                     \
    out15 = in0 - in15;                                                     \
}
2305 
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3      (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x4 byte block)
                 Return Type - unsigned byte
   Details     : Rows are combined with double word and byte interleaves;
                 out1..out3 are then extracted by sliding the combined vector
                 4 bytes at a time.
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}

/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0, in1, in2, in3      (input 8x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x8 byte block)
                 Return Type - as per RTYPE
   Details     : Built from even-word, byte, halfword and word interleaves;
                 only in0..in7's relevant rows contribute to the 4 outputs.
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
                                                                        \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
                                                                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)

/* Description : Transposes input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x8 byte block)
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                           (output 8x8 byte block)
                 Return Type - as per RTYPE
   Details     : Byte and word interleaves produce the even output rows;
                 odd rows are extracted by sliding each even row 8 bytes.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                        out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                        \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                \
    v16i8 zeros = { 0 };                                                 \
                                                                         \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                   \
               tmp0_m, tmp1_m, tmp2_m, tmp3_m);                          \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                         \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                         \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                         \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                         \
    SLDI_B4(RTYPE, zeros, out0, zeros, out2, zeros, out4, zeros, out6,   \
            8, out1, out3, out5, out7);                                  \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)

/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned byte
   Details     : Even-word/double-word interleaves gather 4 bytes from each
                 input row; even/odd byte and halfword interleaves then
                 distribute them into the 4 transposed output rows.
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3)                        \
{                                                                          \
    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
                                                                           \
    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                      \
    out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                      \
    out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                     \
                                                                           \
    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                     \
                                                                           \
    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
    out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
                                                                           \
    tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1);            \
    tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m);        \
    out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
}
2412 
/* Description : Transposes a 16x8 block of byte elements into an 8x16 block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                           (the low doubleword of each input holds one
                           8-byte row)
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : Row pairs (k, k + 8) are first joined with even-doubleword
                 interleaves, then successive even/odd byte, halfword and word
                 interleaves complete the transposition.  The output variables
                 are also used as scratch during the shuffle, so the statement
                 order is significant.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
2459 
/* Description : Transposes a 4x4 block of halfword elements
   Arguments   : Inputs  - in0, in1, in2, in3 (the low 4 halfwords of each
                           hold one row)
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Right halfword interleaves of the row pairs followed by a
                 right/left word interleave produce out0 and out2; out1 and
                 out3 are then extracted as the upper doublewords of those
                 results, so out1 must be computed before out0 is consumed
                 for out3.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
                                                                        \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);            \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
2475 
/* Description : Transposes an 8x8 block of halfword elements
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Right/left halfword interleaves of the row pairs build eight
                 intermediate vectors; their even and odd doublewords are then
                 packed (pckev_d / pckod_d) into the even- and odd-indexed
                 transposed rows respectively.
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                       out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                               \
                                                                        \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                            \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                            \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                            \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                            \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,     \
             tmp3_m, tmp7_m, out0, out2, out4, out6);                   \
    out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m);       \
    out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m);       \
    out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m);       \
    out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m);       \
}
/* Concrete element-type variants: unsigned / signed halfword outputs. */
#define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
2506 
/* Description : Transposes a 4x4 block of word elements
   Arguments   : Inputs  - in0, in1, in2, in3 (one 4-word row each)
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : Right/left word interleaves of the row pairs followed by
                 right/left doubleword interleaves yield the four transposed
                 rows.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
                                                                        \
    out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);            \
    out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);            \
    out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);            \
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
}
2525 
/* Description : Average byte elements from four vector pairs and store the
                 low 8 bytes of each result as an 8x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           pdst, stride
   Details     : Each byte of pair (in2k, in2k+1) is averaged with the
                 truncating average (a + b) / 2 (__msa_ave_u_b).  Only the
                 lower doubleword (8 bytes) of each of the four results is
                 copied out and stored, producing an 8x4 byte block at pdst
                 with the given stride.
*/
#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                       \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                       \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                       \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                       \
                                                                            \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0);                             \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                      \
}
2556 
/* Description : Average byte elements from four vector pairs and store the
                 full 16-byte results as a 16x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           pdst, stride
   Details     : Each byte of pair (in2k, in2k+1) is averaged with the
                 truncating average (a + b) / 2 (__msa_ave_u_b); the four
                 16-byte results are stored at pdst with the given stride.
*/
#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 avg0_m, avg1_m, avg2_m, avg3_m;                                    \
                                                                             \
    avg0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                        \
    avg1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                        \
    avg2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                        \
    avg3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                        \
                                                                             \
    ST_UB4(avg0_m, avg1_m, avg2_m, avg3_m, pdst, stride);                    \
}
2582 
/* Description : Average rounded byte elements from four vector pairs and
                 store the low 8 bytes of each result as an 8x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           pdst, stride
   Details     : Each byte of pair (in2k, in2k+1) is averaged with rounding,
                 (a + b + 1) / 2 (AVER_UB4_UB).  Only the lower doubleword
                 (8 bytes) of each of the four results is copied out and
                 stored, producing an 8x4 byte block at pdst with the given
                 stride.
*/
#define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                 \
    v16u8 tp0_m, tp1_m, tp2_m, tp3_m;                                        \
                                                                             \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                      \
                tp0_m, tp1_m, tp2_m, tp3_m);                                 \
                                                                             \
    out0_m = __msa_copy_u_d((v2i64) tp0_m, 0);                               \
    out1_m = __msa_copy_u_d((v2i64) tp1_m, 0);                               \
    out2_m = __msa_copy_u_d((v2i64) tp2_m, 0);                               \
    out3_m = __msa_copy_u_d((v2i64) tp3_m, 0);                               \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                       \
}
2611 
/* Description : Average rounded byte elements from four vector pairs and
                 store the full 16-byte results as a 16x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           pdst, stride
   Details     : Each byte of pair (in2k, in2k+1) is averaged with rounding,
                 (a + b + 1) / 2 (AVER_UB4_UB); the four 16-byte results are
                 stored at pdst with the given stride.
                 Note: local names deliberately avoid tmp*_m / dst*_m, which
                 callers (e.g. AVER_DST_ST16x4_UB) pass as arguments.
*/
#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                             \
    v16u8 aver0_m, aver1_m, aver2_m, aver3_m;                                 \
                                                                              \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                       \
                aver0_m, aver1_m, aver2_m, aver3_m);                          \
                                                                              \
    ST_UB4(aver0_m, aver1_m, aver2_m, aver3_m, pdst, stride);                 \
}
2634 
/* Description : Average rounded byte elements from four vector pairs,
                 average rounded again with the current destination pixels
                 and store as an 8x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           pdst, stride
   Details     : tmpk = rounded average (a + b + 1) / 2 of pair
                 (in2k, in2k+1).  The four existing destination rows are
                 loaded from pdst, each tmpk is average-rounded with its
                 corresponding row, and the low 8 bytes of each result are
                 stored back (both steps via AVER_ST8x4_UB).
*/
#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                          pdst, stride)                            \
{                                                                  \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
                                                                   \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);          \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
    AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
2662 
/* Description : Average rounded byte elements from four vector pairs,
                 average rounded again with the current destination pixels
                 and store as a 16x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           pdst, stride
   Details     : tmpk = rounded average (a + b + 1) / 2 of pair
                 (in2k, in2k+1).  The four existing destination rows are
                 loaded from pdst, each tmpk is average-rounded with its
                 corresponding row, and the four full 16-byte results are
                 stored back (both steps via AVER_ST16x4_UB).
*/
#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                           pdst, stride)                            \
{                                                                   \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
                                                                    \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
    AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
2690 
/* Description : Add a 4x4 block of halfword residuals to destination pixels
   Arguments   : Inputs  - in0, in1, in2, in3 (the low 4 halfwords of each
                           hold one residual row), pdst, stride
   Details     : The four residual rows are packed into two vectors; the four
                 destination rows are loaded as 32-bit words, zero-extended to
                 halfwords, added to the residuals, clipped to 0..255, packed
                 back to bytes and stored as four 32-bit words at pdst with
                 the given stride.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)            \
{                                                                    \
    uint32_t src0_m, src1_m, src2_m, src3_m;                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;                         \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                            \
    v16i8 dst0_m = { 0 };                                            \
    v16i8 dst1_m = { 0 };                                            \
    v16i8 zero_m = { 0 };                                            \
                                                                     \
    /* pack residual rows: inp0_m = in1|in0, inp1_m = in3|in2 */     \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);                  \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);               \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                            \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                            \
    /* zero-extend destination bytes to halfwords before adding */   \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);      \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);            \
    CLIP_SH2_0_255(res0_m, res1_m);                                  \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);     \
                                                                     \
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);                      \
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);                      \
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);                      \
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);                      \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);               \
}
2720 
/* Description : Dot product and addition of 3 input vectors, all operands
                 treated as vectors of signed bytes (note: despite the _SH
                 name, the inputs are consumed as v16i8, not halfwords)
   Arguments   : Inputs  - in0, in1, in2, coeff0, coeff1, coeff2
                 Outputs - out0_m
                 Return Type - signed halfword
   Details     : Dot product of 'in0' with 'coeff0'
                 Dot product of 'in1' with 'coeff1' (accumulated)
                 Dot product of 'in2' with 'coeff2' (accumulated)
                 Each dot product pairs adjacent signed bytes into signed
                 halfword lanes.

                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)         \
( {                                                                 \
    v8i16 out0_m;                                                   \
                                                                    \
    out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);           \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);  \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2);  \
                                                                    \
    out0_m;                                                         \
} )
2742 
/* Description : Pack even byte elements of two vectors and xor with 128
   Arguments   : Inputs  - in0, in1
                 Return Type - unsigned byte (statement-expression value)
   Details     : The even-indexed bytes of 'in0' and 'in1' are packed into a
                 single vector which is then xor'ed with 128, flipping the
                 sign bit to move values from the signed to the unsigned
                 byte range.
*/
#define PCKEV_XORI128_UB(in0, in1)                                \
( {                                                               \
    v16u8 pck_res_m;                                              \
                                                                  \
    pck_res_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    pck_res_m = (v16u8) __msa_xori_b(pck_res_m, 128);             \
    pck_res_m;                                                    \
} )
2758 
/* Description : Converts inputs to unsigned bytes, interleaves, averages
                 with destination vectors and stores as an 8x4 unsigned byte
                 block
   Arguments   : Inputs  - in0, in1, in2, in3, dst0, dst1, pdst, stride
   Details     : (in0, in1) and (in2, in3) are even-byte packed and xor'ed
                 with 128 (PCKEV_XORI128_UB), average-rounded with dst0/dst1
                 (AVER_UB2_UB), and the two doublewords of each result are
                 stored as four 8-byte rows at pdst with the given stride.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,           \
                                dst0, dst1, pdst, stride)     \
{                                                             \
    v16u8 tmp0_m, tmp1_m;                                     \
    uint8_t *pdst_m = (uint8_t *) (pdst);                     \
                                                              \
    tmp0_m = PCKEV_XORI128_UB(in0, in1);                      \
    tmp1_m = PCKEV_XORI128_UB(in2, in3);                      \
    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);  \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);        \
}
2774 
/* Description : Pack even byte elements of two vector pairs, extract word
                 lanes 0 and 2 of each packed result and store the 4 words
                 in destination memory as per stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : PCKEV_B2_SB packs the even bytes of (in1, in0) and
                 (in3, in2); word lanes 0 and 2 of each packed vector (the
                 groups built from in0..in3 respectively) are copied out and
                 stored with SW4.
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;              \
    v16i8 tmp0_m, tmp1_m;                                 \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}
2794 
/* Description : Pack even byte elements of two vectors and store the packed
                 16-byte vector in destination memory
   Arguments   : Inputs  - in0, in1, pdst
   Details     : The even-indexed bytes of 'in0' and 'in1' are combined into
                 one signed-byte vector and written to pdst via ST_SB.
*/
#define PCKEV_ST_SB(in0, in1, pdst)                     \
{                                                       \
    v16i8 pckev_m;                                      \
                                                        \
    pckev_m = __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    ST_SB(pckev_m, (pdst));                             \
}
2805 
/* Description : Horizontal 2-tap filter kernel
   Arguments   : Inputs  - in0, in1, mask, coeff, shift
                 Return Type - unsigned halfword (statement-expression value)
   Details     : Bytes of in0/in1 are gathered per 'mask' (vshf_b); the
                 unsigned-byte dot product with 'coeff' forms halfword sums,
                 which are rounded right-shifted by 'shift' and then
                 saturated to the unsigned range selected by 'shift'
                 (sat_u_h).
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)            \
( {                                                                 \
    v16i8 tmp0_m;                                                   \
    v8u16 tmp1_m;                                                   \
                                                                    \
    tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0);  \
    tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff);         \
    tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift);          \
    tmp1_m = __msa_sat_u_h(tmp1_m, shift);                          \
                                                                    \
    tmp1_m;                                                         \
} )
2821 #endif  /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */
2822