• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
22 #define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
23 
24 #include <stdint.h>
25 #include <msa.h>
26 #include <config.h>
27 
28 #if HAVE_MSA2
29 #include <msa2.h>
30 #endif
31 
/* Default byte alignment for MSA vector data (one 128-bit vector). */
#define ALIGNMENT           16
/* Variable attribute requesting alignment for MSA buffers.
 * NOTE(review): '(align) << 1' requests TWICE the given alignment
 * (e.g. ALLOC_ALIGNED(ALIGNMENT) yields 32-byte alignment) —
 * presumably deliberate headroom; confirm against callers. */
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
34 
/* Description : Load one vector of the given element type
   Arguments   : Inputs  - RTYPE (vector type), psrc (pointer to load from)
                 Return  - the vector value read from (psrc)
   Details     : Plain dereference through a cast; evaluates to an lvalue
                 of type RTYPE.  Typed wrappers below fix RTYPE. */
#define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)   /* 16 x unsigned byte  */
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)   /* 16 x signed byte    */
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)   /*  8 x unsigned half  */
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)   /*  8 x signed half    */
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)   /*  4 x unsigned word  */
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)   /*  4 x signed word    */
42 
/* Description : Store one vector of the given element type
   Arguments   : Inputs  - RTYPE (vector type), in (vector to store),
                           pdst (pointer to store to)
   Details     : Plain dereference-and-assign of the whole vector.
                 Typed wrappers below fix RTYPE. */
#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)   /* 16 x unsigned byte  */
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)   /* 16 x signed byte    */
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)   /*  8 x unsigned half  */
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)   /*  8 x signed half    */
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)   /*  4 x unsigned word  */
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)   /*  4 x signed word    */
50 
/* Scalar unaligned memory access helpers.
 * LH/LW/LD load a 16/32/64-bit value from a possibly unaligned address;
 * SH/SW/SD store one back.  On MIPS r6 plain loads/stores are used (r6
 * supports unaligned scalar accesses natively); on pre-r6 ISAs the
 * dedicated unaligned instructions are emitted via inline asm instead.
 * Locals carry the '_m' suffix to avoid shadowing callers' variables. */
#if (__mips_isa_rev >= 6)
    /* Load an unaligned uint16_t from (psrc). */
    #define LH(psrc)                              \
    ( {                                           \
        uint16_t val_lh_m = *(uint16_t *)(psrc);  \
        val_lh_m;                                 \
    } )

    /* Load an unaligned uint32_t from (psrc). */
    #define LW(psrc)                              \
    ( {                                           \
        uint32_t val_lw_m = *(uint32_t *)(psrc);  \
        val_lw_m;                                 \
    } )

    #if (__mips == 64)
        /* 64-bit target: load an unaligned uint64_t in one access. */
        #define LD(psrc)                               \
        ( {                                            \
            uint64_t val_ld_m =  *(uint64_t *)(psrc);  \
            val_ld_m;                                  \
        } )
    #else  // !(__mips == 64)
        /* 32-bit target: assemble the 64-bit value from two 32-bit
           loads, low word first (little-endian word order). */
        #define LD(psrc)                                                    \
        ( {                                                                 \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
            uint32_t val0_ld_m, val1_ld_m;                                  \
            uint64_t val_ld_m = 0;                                          \
                                                                            \
            val0_ld_m = LW(psrc_ld_m);                                      \
            val1_ld_m = LW(psrc_ld_m + 4);                                  \
                                                                            \
            val_ld_m = (uint64_t) (val1_ld_m);                              \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
                                                                            \
            val_ld_m;                                                       \
        } )
    #endif  // (__mips == 64)

    /* Unaligned scalar stores.  Note the trailing ';' inside each
       expansion — some call sites in this file invoke these without
       their own ';' and rely on it. */
    #define SH(val, pdst)  *(uint16_t *)(pdst) = (val);
    #define SW(val, pdst)  *(uint32_t *)(pdst) = (val);
    #define SD(val, pdst)  *(uint64_t *)(pdst) = (val);

#else  // !(__mips_isa_rev >= 6)
    /* Load an unaligned uint16_t via the 'ulh' (unaligned load half)
       pseudo-instruction; source is passed as a memory operand. */
    #define LH(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lh_m = (uint8_t *) (psrc);     \
        uint16_t val_lh_m;                           \
                                                     \
        __asm__ volatile (                           \
            "ulh  %[val_lh_m],  %[psrc_lh_m]  \n\t"  \
                                                     \
            : [val_lh_m] "=r" (val_lh_m)             \
            : [psrc_lh_m] "m" (*psrc_lh_m)           \
        );                                           \
                                                     \
        val_lh_m;                                    \
    } )

    /* Load an unaligned uint32_t with the lwr/lwl pair.
       NOTE(review): lwr at offset 0 / lwl at offset 3 is the
       little-endian pairing — assumes an LE build; confirm. */
    #define LW(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lw_m = (uint8_t *) (psrc);     \
        uint32_t val_lw_m;                           \
                                                     \
        __asm__ volatile (                           \
            "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t"  \
            "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t"  \
                                                     \
            : [val_lw_m] "=&r"(val_lw_m)             \
            : [psrc_lw_m] "r"(psrc_lw_m)             \
        );                                           \
                                                     \
        val_lw_m;                                    \
    } )

    #if (__mips == 64)
        /* 64-bit target: unaligned uint64_t via the ldr/ldl pair. */
        #define LD(psrc)                                 \
        ( {                                              \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);     \
            uint64_t val_ld_m = 0;                       \
                                                         \
            __asm__ volatile (                           \
                "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t"  \
                "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t"  \
                                                         \
                : [val_ld_m] "=&r" (val_ld_m)            \
                : [psrc_ld_m] "r" (psrc_ld_m)            \
            );                                           \
                                                         \
            val_ld_m;                                    \
        } )
    #else  // !(__mips == 64)
        /* 32-bit target: assemble the 64-bit value from two unaligned
           32-bit loads, low word first (little-endian word order). */
        #define LD(psrc)                                                    \
        ( {                                                                 \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
            uint32_t val0_ld_m, val1_ld_m;                                  \
            uint64_t val_ld_m = 0;                                          \
                                                                            \
            val0_ld_m = LW(psrc_ld_m);                                      \
            val1_ld_m = LW(psrc_ld_m + 4);                                  \
                                                                            \
            val_ld_m = (uint64_t) (val1_ld_m);                              \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
                                                                            \
            val_ld_m;                                                       \
        } )
    #endif  // (__mips == 64)

    /* Store an unaligned uint16_t via 'ush' (unaligned store half);
       destination is an "=m" output operand. */
    #define SH(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sh_m = (uint8_t *) (pdst);     \
        uint16_t val_sh_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "ush  %[val_sh_m],  %[pdst_sh_m]  \n\t"  \
                                                     \
            : [pdst_sh_m] "=m" (*pdst_sh_m)          \
            : [val_sh_m] "r" (val_sh_m)              \
        );                                           \
    }

    /* Store an unaligned uint32_t via 'usw' (unaligned store word). */
    #define SW(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sw_m = (uint8_t *) (pdst);     \
        uint32_t val_sw_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "usw  %[val_sw_m],  %[pdst_sw_m]  \n\t"  \
                                                     \
            : [pdst_sw_m] "=m" (*pdst_sw_m)          \
            : [val_sw_m] "r" (val_sw_m)              \
        );                                           \
    }

    /* Store an unaligned uint64_t as two unaligned 32-bit stores,
       low word first (little-endian word order). */
    #define SD(val, pdst)                                             \
    {                                                                 \
        uint8_t *pdst_sd_m = (uint8_t *) (pdst);                      \
        uint32_t val0_sd_m, val1_sd_m;                                \
                                                                      \
        val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
        val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
                                                                      \
        SW(val0_sd_m, pdst_sd_m);                                     \
        SW(val1_sd_m, pdst_sd_m + 4);                                 \
    }
#endif // (__mips_isa_rev >= 6)
196 
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 Loads word in 'out2' from (psrc + 2 * stride)
                 Loads word in 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW((psrc));                             \
    out1 = LW((psrc) + stride);                    \
    out2 = LW((psrc) + 2 * stride);                \
    out3 = LW((psrc) + 3 * stride);                \
}

/* Same as LW4 but loads only 2 words: from (psrc) and (psrc + stride). */
#define LW2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LW((psrc));                 \
    out1 = LW((psrc) + stride);        \
}
219 
/* Description : Load double words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
*/
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + stride);        \
}
/* Loads 4 double words from (psrc), (psrc + stride), (psrc + 2 * stride)
   and (psrc + 3 * stride) into out0..out3, via two LD2 invocations. */
#define LD4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    LD2((psrc), stride, out0, out1);               \
    LD2((psrc) + 2 * stride, stride, out2, out3);  \
}
237 
/* Description : Store 4 words with stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
   Note        : The first SW now carries an explicit ';' like the other
                 three lines; it previously relied on the trailing ';'
                 inside SW's own expansion.
*/
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + stride);                  \
    SW(in2, (pdst) + 2 * stride);              \
    SW(in3, (pdst) + 3 * stride);              \
}
252 
/* Description : Store 4 double words with stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
   Note        : The first SD now carries an explicit ';' like the other
                 three lines; it previously relied on the trailing ';'
                 inside SD's own expansion.
*/
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + stride);                  \
    SD(in2, (pdst) + 2 * stride);              \
    SD(in3, (pdst) + 3 * stride);              \
}
267 
/* Description : Load vector elements with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Loads elements in 'out0' from (psrc)
                 Loads elements in 'out1' from (psrc + stride)
   Note        : '(psrc) + stride' uses psrc's own pointer arithmetic;
                 callers typically pass byte pointers, making stride a
                 byte stride.
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_V(RTYPE, (psrc));                 \
    out1 = LD_V(RTYPE, (psrc) + stride);        \
}
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)
286 
/* Loads 3 vectors: out0/out1 from (psrc)/(psrc + stride) via LD_V2,
   out2 from (psrc + 2 * stride). */
#define LD_V3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_V2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride);          \
}
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)

/* Loads 4 vectors, stride apart, via two LD_V2 invocations. */
#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
{                                                            \
    LD_V2(RTYPE, (psrc), stride, out0, out1);                \
    LD_V2(RTYPE, (psrc) + 2 * stride , stride, out2, out3);  \
}
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
#define LD_SW4(...) LD_V4(v4i32, __VA_ARGS__)
305 
/* Loads 5 vectors, stride apart: LD_V4 for out0..out3, then out4 from
   (psrc + 4 * stride). */
#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
    out4 = LD_V(RTYPE, (psrc) + 4 * stride);                      \
}
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)

/* Loads 6 vectors, stride apart, via LD_V4 + LD_V2. */
#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)
323 
/* Loads 7 vectors, stride apart, via LD_V5 + LD_V2. */
#define LD_V7(RTYPE, psrc, stride,                               \
              out0, out1, out2, out3, out4, out5, out6)          \
{                                                                \
    LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
    LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
}
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)

/* Loads 8 vectors, stride apart, via two LD_V4 invocations. */
#define LD_V8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
#define LD_SW8(...) LD_V8(v4i32, __VA_ARGS__)
344 
/* Loads 16 vectors, stride apart, via two LD_V8 invocations. */
#define LD_V16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_V8(RTYPE, (psrc), stride,                                      \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_V8(RTYPE, (psrc) + 8 * stride, stride,                         \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
355 
/* Description : Store vectors with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst    (destination pointer to store to)
   Details     : Stores elements from 'in0' to (pdst)
                 Stores elements from 'in1' to (pdst + stride)
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_V(RTYPE, in0, (pdst));                 \
    ST_V(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)
372 
/* Stores 4 vectors, stride apart, via two ST_V2 invocations. */
#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_V2(RTYPE, in0, in1, (pdst), stride);               \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)

/* Stores 6 vectors, stride apart, via ST_V4 + ST_V2. */
#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
    ST_V2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
}
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)

/* Stores 8 vectors, stride apart, via two ST_V4 invocations. */
#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
}
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)
398 
/* Description : Store half word elements of vector with stride
 * Arguments   : Inputs  - in   source vector
 *                       - pdst    (destination pointer to store to)
 *                       - stride
 * Details     : Stores half word 'idx0' from 'in' to (pdst)
 *               Stores half word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 * Note        : Elements are extracted with __msa_copy_u_h, so idx
 *               arguments select half word lanes of 'in'.
 */
#define ST_H1(in, idx, pdst)                             \
{                                                        \
    uint16_t out0_m;                                     \
    out0_m = __msa_copy_u_h((v8i16) in, idx);            \
    SH(out0_m, (pdst));                                  \
}
#define ST_H2(in, idx0, idx1, pdst, stride)              \
{                                                        \
    uint16_t out0_m, out1_m;                             \
    out0_m = __msa_copy_u_h((v8i16) in, idx0);           \
    out1_m = __msa_copy_u_h((v8i16) in, idx1);           \
    SH(out0_m, (pdst));                                  \
    SH(out1_m, (pdst) + stride);                         \
}
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint16_t out0_m, out1_m, out2_m, out3_m;             \
    out0_m = __msa_copy_u_h((v8i16) in, idx0);           \
    out1_m = __msa_copy_u_h((v8i16) in, idx1);           \
    out2_m = __msa_copy_u_h((v8i16) in, idx2);           \
    out3_m = __msa_copy_u_h((v8i16) in, idx3);           \
    SH(out0_m, (pdst));                                  \
    SH(out1_m, (pdst) + stride);                         \
    SH(out2_m, (pdst) + 2 * stride);                     \
    SH(out3_m, (pdst) + 3 * stride);                     \
}
/* Note: the two ST_H4 invocations below deliberately omit ';' — ST_H4
   expands to a braced block. */
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5,            \
              idx6, idx7, pdst, stride)                          \
{                                                                \
    ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)              \
    ST_H4(in, idx4, idx5, idx6, idx7, (pdst) + 4*stride, stride) \
}
439 
/* Description : Store word elements of vector with stride
 * Arguments   : Inputs  - in   source vector
 *                       - pdst    (destination pointer to store to)
 *                       - stride
 * Details     : Stores word 'idx0' from 'in' to (pdst)
 *               Stores word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 * Note        : Elements are extracted with __msa_copy_u_w, so idx
 *               arguments select word lanes of 'in'.
 */
#define ST_W1(in, idx, pdst)                             \
{                                                        \
    uint32_t out0_m;                                     \
    out0_m = __msa_copy_u_w((v4i32) in, idx);            \
    SW(out0_m, (pdst));                                  \
}
#define ST_W2(in, idx0, idx1, pdst, stride)              \
{                                                        \
    uint32_t out0_m, out1_m;                             \
    out0_m = __msa_copy_u_w((v4i32) in, idx0);           \
    out1_m = __msa_copy_u_w((v4i32) in, idx1);           \
    SW(out0_m, (pdst));                                  \
    SW(out1_m, (pdst) + stride);                         \
}
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint32_t out0_m, out1_m, out2_m, out3_m;             \
    out0_m = __msa_copy_u_w((v4i32) in, idx0);           \
    out1_m = __msa_copy_u_w((v4i32) in, idx1);           \
    out2_m = __msa_copy_u_w((v4i32) in, idx2);           \
    out3_m = __msa_copy_u_w((v4i32) in, idx3);           \
    SW(out0_m, (pdst));                                  \
    SW(out1_m, (pdst) + stride);                         \
    SW(out2_m, (pdst) + 2*stride);                       \
    SW(out3_m, (pdst) + 3*stride);                       \
}
/* Stores word lanes idx0..idx3 of 'in0' to rows 0..3 and idx4..idx7 of
 * 'in1' to rows 4..7 of a block starting at (pdst), rows 'stride' bytes
 * apart.  The ST_W4 invocations omit ';' on purpose — ST_W4 expands to
 * a braced block.  'pdst' is parenthesized for macro hygiene, matching
 * the other store macros in this file. */
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,                   \
              idx4, idx5, idx6, idx7, pdst, stride)               \
{                                                                 \
    ST_W4(in0, idx0, idx1, idx2, idx3, (pdst), stride)            \
    ST_W4(in1, idx4, idx5, idx6, idx7, (pdst) + 4*stride, stride) \
}
480 
/* Description : Store double word elements of vector with stride
 * Arguments   : Inputs  - in   source vector
 *                       - pdst    (destination pointer to store to)
 *                       - stride
 * Details     : Stores double word 'idx0' from 'in' to (pdst)
 *               Stores double word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 * Note        : Elements are extracted with __msa_copy_u_d, so idx
 *               arguments select double word lanes.  ST_D4 takes TWO
 *               source vectors: idx0/idx1 index 'in0', idx2/idx3 'in1'.
 */
#define ST_D1(in, idx, pdst)                   \
{                                              \
    uint64_t out0_m;                           \
    out0_m = __msa_copy_u_d((v2i64) in, idx);  \
    SD(out0_m, (pdst));                        \
}
#define ST_D2(in, idx0, idx1, pdst, stride)    \
{                                              \
    uint64_t out0_m, out1_m;                   \
    out0_m = __msa_copy_u_d((v2i64) in, idx0); \
    out1_m = __msa_copy_u_d((v2i64) in, idx1); \
    SD(out0_m, (pdst));                        \
    SD(out1_m, (pdst) + stride);               \
}
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
{                                                             \
    uint64_t out0_m, out1_m, out2_m, out3_m;                  \
    out0_m = __msa_copy_u_d((v2i64) in0, idx0);               \
    out1_m = __msa_copy_u_d((v2i64) in0, idx1);               \
    out2_m = __msa_copy_u_d((v2i64) in1, idx2);               \
    out3_m = __msa_copy_u_d((v2i64) in1, idx3);               \
    SD(out0_m, (pdst));                                       \
    SD(out1_m, (pdst) + stride);                              \
    SD(out2_m, (pdst) + 2 * stride);                          \
    SD(out3_m, (pdst) + 3 * stride);                          \
}
/* Stores double word lanes idx0..idx3 of 'in0'/'in1' to rows 0..3 and
 * idx4..idx7 of 'in2'/'in3' to rows 4..7 of a block starting at (pdst),
 * rows 'stride' bytes apart.  The ST_D4 invocations omit ';' on purpose
 * — ST_D4 expands to a braced block.  'pdst' is parenthesized for macro
 * hygiene, matching the other store macros in this file. */
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,                \
              idx4, idx5, idx6, idx7, pdst, stride)                      \
{                                                                        \
    ST_D4(in0, in1, idx0, idx1, idx2, idx3, (pdst), stride)              \
    ST_D4(in2, in3, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride) \
}
521 
/* Description : Store as 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Index 0 double word element from input vector 'in0' is copied
                 and stored to destination memory at (pblk_12x8_m) followed by
                 index 2 word element from same input vector 'in0' at
                 (pblk_12x8_m + 8)
                 Similar to remaining lines
   Note        : Each of the 8 rows is 12 bytes wide: the low double word
                 of the vector (bytes 0..7) followed by word lane 2
                 (bytes 8..11); rows are (stride) bytes apart.  All lane
                 extractions are done up front, then the row pointer is
                 bumped by stride between row stores.
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
                                                                         \
    out8_m =  __msa_copy_u_w((v4i32) in0, 2);                            \
    out9_m =  __msa_copy_u_w((v4i32) in1, 2);                            \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}
581 
/* Description : average with rounding (in0 + in1 + 1) / 2.
   Arguments   : Inputs  - in0, in1, in2, in3,
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each byte element from 'in0' vector is added with each byte
                 element from 'in1' vector. The addition of the elements plus 1
                (for rounding) is done unsigned with full precision,
                i.e. the result has one extra bit. Unsigned division by 2
                (or logical shift right by one bit) is performed before writing
                the result to vector 'out0'
                Similar for the pair of 'in2' and 'in3'
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1);  \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3);  \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

/* 4-pair variant, built from two AVER_UB2 invocations (no ';' needed —
   AVER_UB2 expands to a braced block). */
#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
{                                                               \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)             \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)             \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
608 
/* Description : Slide vector elements by an immediate byte count
   Arguments   : Inputs  - dvec, svec, shift (immediate)
                 Outputs - res
                 Return Type - as per RTYPE
   Details     : Byte elements from vector 'dvec' are slid into 'svec' by
                 'shift' element positions; the combined result is written
                 to 'res'.
*/
#define SLDI_B(RTYPE, dvec, svec, shift, res)                        \
{                                                                    \
    res = (RTYPE) __msa_sldi_b((v16i8) dvec, (v16i8) svec, shift);   \
}

/* Two independent slides sharing a single immediate shift count */
#define SLDI_B2(RTYPE, dv0, sv0, dv1, sv1, shift, res0, res1)  \
{                                                              \
    SLDI_B(RTYPE, dv0, sv0, shift, res0)                       \
    SLDI_B(RTYPE, dv1, sv1, shift, res1)                       \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
#define SLDI_B2_SW(...) SLDI_B2(v4i32, __VA_ARGS__)

/* Three independent slides sharing a single immediate shift count */
#define SLDI_B3(RTYPE, dv0, sv0, dv1, sv1, dv2, sv2, shift,  \
                res0, res1, res2)                            \
{                                                            \
    SLDI_B2(RTYPE, dv0, sv0, dv1, sv1, shift, res0, res1)    \
    SLDI_B(RTYPE, dv2, sv2, shift, res2)                     \
}
#define SLDI_B3_UB(...) SLDI_B3(v16u8, __VA_ARGS__)
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)

/* Four independent slides sharing a single immediate shift count */
#define SLDI_B4(RTYPE, dv0, sv0, dv1, sv1, dv2, sv2, dv3, sv3,  \
                shift, res0, res1, res2, res3)                  \
{                                                               \
    SLDI_B2(RTYPE, dv0, sv0, dv1, sv1, shift, res0, res1)       \
    SLDI_B2(RTYPE, dv2, sv2, dv3, sv3, shift, res2, res3)       \
}
#define SLDI_B4_UB(...) SLDI_B4(v16u8, __VA_ARGS__)
#define SLDI_B4_SB(...) SLDI_B4(v16i8, __VA_ARGS__)
#define SLDI_B4_SH(...) SLDI_B4(v8i16, __VA_ARGS__)
650 
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements selected by control vector 'mask0' from the
                 source pair 'in0' & 'in1' are copied to 'out0'
                 Byte elements selected by control vector 'mask1' from the
                 source pair 'in2' & 'in3' are copied to 'out1'
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

/* As VSHF_B2, plus a third shuffle: 'in4' & 'in5' under 'mask2' -> 'out2' */
#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

/* Four shuffles of the single source pair 'in0' & 'in1' under mask0..mask3 */
#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
                out0, out1, out2, out3)                            \
{                                                                  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
686 
/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Halfword elements selected by control vector 'mask0' from
                 the source pair 'in0' & 'in1' are copied to 'out0'
                 Halfword elements selected by control vector 'mask1' from
                 the source pair 'in2' & 'in3' are copied to 'out1'
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

/* As VSHF_H2, plus a third shuffle: 'in4' & 'in5' under 'mask2' -> 'out2' */
#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4);  \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)
710 
/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Word elements selected by control vector 'mask0' from the
                 source pair 'in0' & 'in1' are copied to 'out0'
                 Word elements selected by control vector 'mask1' from the
                 source pair 'in2' & 'in3' are copied to 'out1'
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)      \
{                                                                         \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
726 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0' producing a result
                 twice the size of the input i.e. unsigned halfword.
                 The products of adjacent odd-even element pairs are then
                 added together and stored to the out vector
                 (8 unsigned halfword results)
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1);  \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

/* As DOTP_UB2, for four mult/cnst pairs */
#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
754 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of the input i.e. signed halfword.
                 The products of adjacent odd-even element pairs are then
                 added together and stored to the out vector
                 (8 signed halfword results)
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

/* As DOTP_SB2, plus a third dot product: 'mult2' x 'cnst2' -> 'out2' */
#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

/* As DOTP_SB2, for four mult/cnst pairs */
#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                     \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
789 
/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of the input i.e. signed word.
                 The products of adjacent odd-even element pairs are then
                 added together and stored to the out vector
                 (4 signed word results)
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1);  \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

/* As DOTP_SH2, for four mult/cnst pairs */
#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
817 
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (accumulated in place)
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of the input i.e. signed halfword.
                 The products of adjacent odd-even element pairs are then
                 added to the existing contents of the out vector
                 (8 signed halfword results, accumulated)
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                   \
                                   (v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                   \
                                   (v16i8) mult1, (v16i8) cnst1);  \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

/* As DPADD_SB2, for four mult/cnst pairs */
#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
846 
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (accumulated in place)
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0' producing a result
                 twice the size of the input i.e. unsigned halfword.
                 The products of adjacent odd-even element pairs are then
                 added to the existing contents of the out vector
                 (8 unsigned halfword results, accumulated)
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0,                   \
                                   (v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1,                   \
                                   (v16u8) mult1, (v16u8) cnst1);  \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)
867 
/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1 (accumulated in place)
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of the input i.e. signed word.
                 The products of adjacent odd-even element pairs are then
                 added to the existing contents of the out vector
                 (4 signed word results, accumulated)
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                   \
                                   (v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                   \
                                   (v8i16) mult1, (v8i16) cnst1);  \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

/* As DPADD_SH2, for four mult/cnst pairs */
#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
896 
/* Description : Per-element unsigned minimum against a threshold vector
   Arguments   : Inputs  - v0, v1, min_v
                 Outputs - v0, v1 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element of 'v0' is replaced by the
                 smaller of itself and the corresponding element of 'min_v'
                 (so 'min_v' acts as a per-element upper cap). 'v1' is
                 processed the same way.
*/
#define MIN_UH2(RTYPE, v0, v1, min_v)               \
{                                                   \
    v0 = (RTYPE) __msa_min_u_h((v8u16) v0, min_v);  \
    v1 = (RTYPE) __msa_min_u_h((v8u16) v1, min_v);  \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

/* Cap four vectors with the same threshold vector */
#define MIN_UH4(RTYPE, v0, v1, v2, v3, min_v)  \
{                                              \
    MIN_UH2(RTYPE, v0, v1, min_v);             \
    MIN_UH2(RTYPE, v2, v3, min_v);             \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
918 
/* Description : Clips all halfword elements of input vector between min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in    (input vector)
                         - min   (vector of per-element min thresholds)
                         - max   (vector of per-element max thresholds)
                 Outputs - in    (output vector with clipped elements)
                 Return Type - signed halfword
   Details     : The lower bound is applied first (max with 'min'), then the
                 upper bound (min with 'max'); if min > max for an element,
                 the result is 'max' for that element.
*/
#define CLIP_SH(in, min, max)                     \
{                                                 \
    in = __msa_max_s_h((v8i16) min, (v8i16) in);  \
    in = __msa_min_s_h((v8i16) max, (v8i16) in);  \
}
932 
/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in    (input vector)
                 Outputs - in    (output vector with clipped elements)
                 Return Type - signed halfwords
   Details     : Negative elements are first clamped to 0 (maxi_s with
                 immediate 0); the values are then saturated to 8 bits,
                 i.e. 255 (sat_u with bit position 7).
*/
#define CLIP_SH_0_255(in)                       \
{                                               \
    in = __msa_maxi_s_h((v8i16) in, 0);         \
    in = (v8i16) __msa_sat_u_h((v8u16) in, 7);  \
}

/* Clip two vectors to [0, 255] */
#define CLIP_SH2_0_255(in0, in1)  \
{                                 \
    CLIP_SH_0_255(in0);           \
    CLIP_SH_0_255(in1);           \
}

/* Clip four vectors to [0, 255] */
#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
}

/* Clip eight vectors to [0, 255] */
#define CLIP_SH8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SH4_0_255(in0, in1, in2, in3);     \
    CLIP_SH4_0_255(in4, in5, in6, in7);     \
}
963 
/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in    (input vector)
                 Outputs - in    (output vector with clipped elements)
                 Return Type - signed word
   Details     : Negative elements are first clamped to 0 (maxi_s with
                 immediate 0); the values are then saturated to 8 bits,
                 i.e. 255 (sat_u with bit position 7).
*/
#define CLIP_SW_0_255(in)                       \
{                                               \
    in = __msa_maxi_s_w((v4i32) in, 0);         \
    in = (v4i32) __msa_sat_u_w((v4u32) in, 7);  \
}

/* Clip two vectors to [0, 255] */
#define CLIP_SW2_0_255(in0, in1)  \
{                                 \
    CLIP_SW_0_255(in0);           \
    CLIP_SW_0_255(in1);           \
}

/* Clip four vectors to [0, 255] */
#define CLIP_SW4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SW2_0_255(in0, in1);               \
    CLIP_SW2_0_255(in2, in3);               \
}

/* Clip eight vectors to [0, 255] */
#define CLIP_SW8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SW4_0_255(in0, in1, in2, in3);     \
    CLIP_SW4_0_255(in4, in5, in6, in7);     \
}
994 
/* Description : Addition of 4 signed word elements
                 4 signed word elements of input vector are added together
                 and the resulting integer sum is returned
   Arguments   : Inputs  - in       (signed word vector)
                 Outputs - sum_m    (i32 sum)
                 Return Type - signed word
   Details     : A pairwise horizontal add widens the 4 words to 2 signed
                 doublewords; the upper doubleword is splatted down and
                 added to the lower one, and the low 32 bits of that total
                 are copied out (the sum is truncated to int32_t).
*/
#define HADD_SW_S32(in)                               \
( {                                                   \
    v2i64 res0_m, res1_m;                             \
    int32_t sum_m;                                    \
                                                      \
    res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in);  \
    res1_m = __msa_splati_d(res0_m, 1);               \
    res0_m += res1_m;                                 \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);        \
    sum_m;                                            \
} )
1013 
/* Description : Addition of 8 unsigned halfword elements
                 8 unsigned halfword elements of input vector are added
                 together and the resulting integer sum is returned
   Arguments   : Inputs  - in       (unsigned halfword vector)
                 Outputs - sum_m    (u32 sum)
                 Return Type - unsigned word
   Details     : Two widening pairwise horizontal adds reduce the 8
                 halfwords to 2 unsigned doublewords; the upper doubleword
                 is splatted down and added to the lower one, and the low
                 32 bits of that total are copied out (the sum is truncated
                 to uint32_t).
*/
#define HADD_UH_U32(in)                                  \
( {                                                      \
    v4u32 res_m;                                         \
    v2u64 res0_m, res1_m;                                \
    uint32_t sum_m;                                      \
                                                         \
    res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in);      \
    res0_m = __msa_hadd_u_d(res_m, res_m);               \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);  \
    res0_m += res1_m;                                    \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);           \
    sum_m;                                               \
} )
1034 
/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd byte element from 'in0' is added to the
                 adjacent even signed byte element of 'in0' (pairwise) and
                 the widened halfword result is stored in 'out0'
                 Same for 'in1' -> 'out1'
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0);  \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1);  \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

/* As HADD_SB2, for four input vectors */
#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_SB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_SB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)
1057 
/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to the
                 adjacent even unsigned byte element of 'in0' (pairwise)
                 and the widened halfword result is stored in 'out0'
                 Same for 'in1' -> 'out1'
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

/* As HADD_UB2, plus a third horizontal add: 'in2' -> 'out2' */
#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
{                                                             \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                    \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

/* As HADD_UB2, for four input vectors */
#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)
1088 
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Within each adjacent pair of unsigned byte elements of
                 'in0', the even element is subtracted from the odd element
                 (odd - even, per MSA HSUB_U.H — confirm against the MSA
                 manual) and the signed halfword difference is stored in
                 'out0'. Same for 'in1' -> 'out1'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

/* As HSUB_UB2, for four input vectors */
#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HSUB_UB2(RTYPE, in0, in1, out0, out1);                           \
    HSUB_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
1112 
/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1  (unsigned byte src & ref)
                 Outputs - sad_m                 (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : Absolute differences of all byte elements of 'in0' vs
                 'ref0' (and 'in1' vs 'ref1') are computed; the 16 unsigned
                 absolute-difference values per pair are added even-odd
                 pairwise into 8 halfword partial sums, accumulated into
                 'sad_m'. The MSA2 path uses a fused absolute-difference
                 and pairwise-accumulate builtin instead.
*/
#if HAVE_MSA2
#define SAD_UB2_UH(in0, in1, ref0, ref1)                                 \
( {                                                                      \
    v8u16 sad_m = { 0 };                                                 \
    sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in0, (v16u8) ref0); \
    sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in1, (v16u8) ref1); \
    sad_m;                                                               \
} )
#else
#define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
( {                                                             \
    v16u8 diff0_m, diff1_m;                                     \
    v8u16 sad_m = { 0 };                                        \
                                                                \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0);        \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1);        \
                                                                \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m);  \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m);  \
                                                                \
    sad_m;                                                      \
} )
#endif // #if HAVE_MSA2
1145 
/* Description : Insert specified word elements from input GP registers to 1
                 destination vector
   Arguments   : Inputs  - in0, in1, in2, in3 (32-bit GP register values)
                 Outputs - out                (output vector)
                 Return Type - as per RTYPE
   Details     : INSERT_W2 writes 'in0'/'in1' into word lanes 0 and 1 of
                 'out'; INSERT_W4 fills all four word lanes. Lanes that are
                 not written keep their previous contents.
*/
#define INSERT_W2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)       \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
1171 
/* Description : Insert specified double word elements from input GP
                 registers to 1 destination vector
   Arguments   : Inputs  - in0, in1      (64-bit GP register values)
                 Outputs - out           (output vector)
                 Return Type - as per RTYPE
   Details     : 'in0' and 'in1' are written into doubleword lanes 0 and 1
                 of 'out' respectively
*/
#define INSERT_D2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0);  \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
1187 
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and even byte elements of
                 'in1' are interleaved and copied to 'out0'
                 Even byte elements of 'in2' and even byte elements of
                 'in3' are interleaved and copied to 'out1'
                 NOTE(review): per MSA ILVEV.B the first-listed input
                 ('in0') appears in the even result lanes — confirm against
                 the MSA manual
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2);  \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
1206 
/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and even halfword elements
                 of 'in1' are interleaved and copied to 'out0'
                 Even halfword elements of 'in2' and even halfword elements
                 of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2);  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1224 
/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - vec0, vec1, vec2, vec3
                 Outputs - dst0, dst1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'vec0' and 'vec1' are interleaved
                 and written to 'dst0'; even word elements of 'vec2' and
                 'vec3' are interleaved and written to 'dst1'
*/
#define ILVEV_W2(RTYPE, vec0, vec1, vec2, vec3, dst0, dst1)    \
{                                                              \
    dst0 = (RTYPE) __msa_ilvev_w((v4i32) vec1, (v4i32) vec0);  \
    dst1 = (RTYPE) __msa_ilvev_w((v4i32) vec3, (v4i32) vec2);  \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
1243 
/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - vec0, vec1, vec2, vec3
                 Outputs - dst0, dst1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'vec0' and 'vec1' are
                 interleaved and written to 'dst0'; even double word elements
                 of 'vec2' and 'vec3' are interleaved and written to 'dst1'
*/
#define ILVEV_D2(RTYPE, vec0, vec1, vec2, vec3, dst0, dst1)    \
{                                                              \
    dst0 = (RTYPE) __msa_ilvev_d((v2i64) vec1, (v2i64) vec0);  \
    dst1 = (RTYPE) __msa_ilvev_d((v2i64) vec3, (v2i64) vec2);  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
1261 
/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of in0 and left half of byte
                 elements of in1 are interleaved and copied to out0.
                 Left half of byte elements of in2 and left half of byte
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

/* 8-input variant: interleaves four input pairs into out0..out3
   via two ILVL_B2 invocations */
#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
1291 
/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of in0 and left half of halfword
                 elements of in1 are interleaved and copied to out0.
                 Left half of halfword elements of in2 and left half of halfword
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

/* 8-input variant: interleaves four input pairs into out0..out3
   via two ILVL_H2 invocations */
#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)
1317 
/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - vec0, vec1, vec2, vec3
                 Outputs - dst0, dst1
                 Return Type - as per RTYPE
   Details     : Left-half word elements of 'vec0' and 'vec1' are
                 interleaved and written to 'dst0'; left-half word elements
                 of 'vec2' and 'vec3' are interleaved and written to 'dst1'
*/
#define ILVL_W2(RTYPE, vec0, vec1, vec2, vec3, dst0, dst1)    \
{                                                             \
    dst0 = (RTYPE) __msa_ilvl_w((v4i32) vec0, (v4i32) vec1);  \
    dst1 = (RTYPE) __msa_ilvl_w((v4i32) vec2, (v4i32) vec3);  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1335 
/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of in0 and right half of byte
                 elements of in1 are interleaved and copied to out0.
                 Right half of byte elements of in2 and right half of byte
                 elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

/* 6-input variant: two pairs via ILVR_B2 plus one extra pair */
#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5);              \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

/* 8-input variant: four pairs via two ILVR_B2 invocations */
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)

/* 16-input variant: eight pairs via two ILVR_B4 invocations */
#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
                in8, in9, in10, in11, in12, in13, in14, in15,     \
                out0, out1, out2, out3, out4, out5, out6, out7)   \
{                                                                 \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
            out0, out1, out2, out3);                              \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,  \
            out4, out5, out6, out7);                              \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
#define ILVR_B8_SW(...) ILVR_B8(v4i32, __VA_ARGS__)
1390 
/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of in0 and right half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Right half of halfword elements of in2 and right half of
                 halfword elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

/* 6-input variant: two pairs via ILVR_H2 plus one extra pair */
#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5);              \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

/* 8-input variant: four pairs via two ILVR_H2 invocations */
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
1424 
/* Description : Interleave right half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of word elements of in0 and right half of word
                 elements of in1 are interleaved and copied to out0.
                 Right half of word elements of in2 and right half of word
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

/* 8-input variant: four pairs via two ILVR_W2 invocations */
#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1442 
/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of in0 and right half of
                 double word elements of in1 are interleaved and copied to out0.
                 Right half of double word elements of in2 and right half of
                 double word elements of in3 are interleaved and copied to out1.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) in2, (v2i64) in3);  \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

/* 6-input variant: two pairs via ILVR_D2 plus one extra pair */
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) in4, (v2i64) in5);              \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

/* 8-input variant: four pairs via two ILVR_D2 invocations */
#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1476 
/* Description : Interleave left half of double word elements from vectors
   Arguments   : Inputs  - vec0, vec1, vec2, vec3
                 Outputs - dst0, dst1
                 Return Type - as per RTYPE
   Details     : Left-half double word elements of 'vec0' and 'vec1' are
                 interleaved and written to 'dst0'; left-half double word
                 elements of 'vec2' and 'vec3' are interleaved and written
                 to 'dst1'
*/
#define ILVL_D2(RTYPE, vec0, vec1, vec2, vec3, dst0, dst1)    \
{                                                             \
    dst0 = (RTYPE) __msa_ilvl_d((v2i64) vec0, (v2i64) vec1);  \
    dst1 = (RTYPE) __msa_ilvl_d((v2i64) vec2, (v2i64) vec3);  \
}
#define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
#define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
#define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)
1494 
/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out0'
                 Left half of byte elements from 'in0' and 'in1' are
                 interleaved and stored to 'out1'
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)

/* Halfword variant: right-half interleave of 'in0'/'in1' to 'out0',
   left-half interleave to 'out1' */
#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
}
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

/* Word variant: right-half interleave of 'in0'/'in1' to 'out0',
   left-half interleave to 'out1' */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1533 
/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - vec0, vec1, max_val
                 Outputs - vec0, vec1 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element of 'vec0' is replaced by the
                 maximum of itself and the immediate 'max_val'; same for
                 'vec1'
*/
#define MAXI_SH2(RTYPE, vec0, vec1, max_val)              \
{                                                         \
    vec0 = (RTYPE) __msa_maxi_s_h((v8i16) vec0, max_val); \
    vec1 = (RTYPE) __msa_maxi_s_h((v8i16) vec1, max_val); \
}
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

/* 4-vector variant, built from two MAXI_SH2 invocations */
#define MAXI_SH4(RTYPE, vec0, vec1, vec2, vec3, max_val) \
{                                                        \
    MAXI_SH2(RTYPE, vec0, vec1, max_val);                \
    MAXI_SH2(RTYPE, vec2, vec3, max_val);                \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
#define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__)

/* 8-vector variant, built from two MAXI_SH4 invocations */
#define MAXI_SH8(RTYPE, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, max_val) \
{                                                                                \
    MAXI_SH4(RTYPE, vec0, vec1, vec2, vec3, max_val);                            \
    MAXI_SH4(RTYPE, vec4, vec5, vec6, vec7, max_val);                            \
}
#define MAXI_SH8_UH(...) MAXI_SH8(v8u16, __VA_ARGS__)
#define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__)
1565 
/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val+1 bits)
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are in placed to original vectors
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val);  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)

/* 4-vector variant, built from two SAT_UH2 invocations */
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
#define SAT_UH4_SH(...) SAT_UH4(v8i16, __VA_ARGS__)

/* 8-vector variant, built from two SAT_UH4 invocations */
#define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val)  \
{                                                                        \
    SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val);                         \
    SAT_UH4(RTYPE, in4, in5, in6, in7, sat_val);                         \
}
#define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__)
#define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__)
1599 
/* Description : Saturate the halfword element values to the range of a
                 (sat_val+1)-bit signed value
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are in placed to original vectors
                 (comment previously said "unsigned"; __msa_sat_s_h performs
                 signed saturation)
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

/* 3-vector variant, SAT_SH2 plus one extra vector */
#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)          \
{                                                       \
    SAT_SH2(RTYPE, in0, in1, sat_val);                  \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

/* 4-vector variant, built from two SAT_SH2 invocations */
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1630 
/* Description : Saturate the word element values to the range of a
                 (sat_val+1)-bit signed value
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, in2, in3, sat_val
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'in0' is saturated to the
                 value generated with (sat_val+1) bit range
                 Results are in placed to original vectors
                 (comment previously said "unsigned"; __msa_sat_s_w performs
                 signed saturation)
*/
#define SAT_SW2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val);  \
}
#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)

/* 4-vector variant, built from two SAT_SW2 invocations */
#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SW2(RTYPE, in0, in1, sat_val);               \
    SAT_SW2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
1654 
/* Description : Indexed halfword element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, idx0, idx1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'idx0' element value from 'in' vector is replicated to all
                  elements in 'out0' vector
                  Valid index range for halfword operation is 0-7
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
{                                                     \
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);  \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

/* 3-output variant: SPLATI_H2 plus one extra splat */
#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2,        \
                  out0, out1, out2)                   \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2);  \
}
#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)

/* 4-output variant, built from two SPLATI_H2 invocations */
#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1689 
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                  elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to all
                  elements in 'out1' vector
                  Valid index range for word operation is 0-3
                  (comment previously said "halfword" for this word-splat)
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)            \
{                                                          \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);      \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1));  \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

/* Replicates word elements 0, 1, 2, 3 of 'in' to out0..out3 */
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1716 
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of in0 are copied to the left half of
                 out0 & even byte elements of in1 are copied to the right
                 half of out0.
                 Even byte elements of in2 are copied to the left half of
                 out1 & even byte elements of in3 are copied to the right
                 half of out1.
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3);  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

/* 6-input variant: two pairs via PCKEV_B2 plus one extra pair */
#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);              \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

/* 8-input variant: four pairs via two PCKEV_B2 invocations */
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1756 
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of in0 are copied to the left half of
                 out0 & even halfword elements of in1 are copied to the right
                 half of out0.
                 Even halfword elements of in2 are copied to the left half of
                 out1 & even halfword elements of in3 are copied to the right
                 half of out1.
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3);  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

/* 8-input variant: four pairs via two PCKEV_H2 invocations */
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1784 
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double elements of in0 are copied to the left half of
                 out0 & even double elements of in1 are copied to the right
                 half of out0.
                 Even double elements of in2 are copied to the left half of
                 out1 & even double elements of in3 are copied to the right
                 half of out1.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

/* 8-input variant: four pairs via two PCKEV_D2 invocations */
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1812 
/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Odd double word element of 'in0' is copied to the left half
                 of 'out0' & odd double word element of 'in1' is copied to
                 the right half of 'out0'.
                 Odd double word element of 'in2' is copied to the left half
                 of 'out1' & odd double word element of 'in3' is copied to
                 the right half of 'out1'.
                 (comment previously listed only in0/in1 and described a
                 single-vector operation; macro takes four inputs)
*/
#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
#define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
1830 
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in0' vector
                 Each unsigned byte element from input vector 'in1' is
                 logically xor'ed with 128 and result is in-place stored in
                 'in1' vector
                 Similar for other pairs
                 (xor with 128 toggles the MSB, i.e. converts between
                 unsigned and signed byte representations)
*/
#define XORI_B2_128(RTYPE, in0, in1)               \
{                                                  \
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

/* 3 vector variant: XORI_B2_128 plus one more vector */
#define XORI_B3_128(RTYPE, in0, in1, in2)          \
{                                                  \
    XORI_B2_128(RTYPE, in0, in1);                  \
    in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

/* 4 vector variant, built from two XORI_B2_128 */
#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

/* 5 vector variant (3 + 2) */
#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

/* 6 vector variant (4 + 2) */
#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

/* 7 vector variant (4 + 3) */
#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

/* 8 vector variant (4 + 4) */
#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
#define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)
1896 
/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then signed saturated
                 between -32768 to +32767 (as per halfword data type)
                 Similar for other pairs
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3);  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

/* 4 vector-pair variant, built from two ADDS_SH2 */
#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
/* NOTE(review): the _UH variant still performs the signed saturating add and
   only reinterprets the result as v8u16 — confirm callers expect that. */
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1921 
/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 result is in place written to 'in0'
                 Similar for other pairs
                 Arguments are parenthesized in the expansion so expression
                 arguments (e.g. a conditional 'shift') are evaluated safely.
*/
#define SLLI_2V(in0, in1, shift)    \
{                                   \
    in0 = (in0) << (shift);         \
    in1 = (in1) << (shift);         \
}
/* 4 vector variant, built from two SLLI_2V for consistency with the other
   *_4 helpers in this file. */
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    SLLI_2V(in0, in1, shift);               \
    SLLI_2V(in2, in3, shift);               \
}
1942 
/* Description : Arithmetic shift right all elements of 4 vectors
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
   Details     : Every element of each input vector is arithmetically shifted
                 right by the GP variable 'shift' and stored back in place.
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 >>= shift;                         \
    in1 >>= shift;                         \
    in2 >>= shift;                         \
    in3 >>= shift;                         \
}
1960 
/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logical by
                 number of bits respective element holds in vector 'shift' and
                 result is in place written to 'in0'
                 Here, 'shift' is a vector passed in (per-element shift
                 amounts), unlike SRA_4V above which takes a scalar
                 Similar for other pairs
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                           \
    in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift);  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
1979 
/* Description : Shift right logical rounded all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift (vector)
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Same as SRL_H4 but using the rounding variant (srlr): the
                 last bit shifted out is added back into the result.
*/
#define SRLR_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                            \
    in0 = (RTYPE) __msa_srlr_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srlr_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srlr_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srlr_h((v8i16) in3, (v8i16) shift);  \
}
#define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
#define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)

/* 8 vector variant, built from two SRLR_H4 */
#define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
{                                                                      \
    SRLR_H4(RTYPE, in0, in1, in2, in3, shift);                         \
    SRLR_H4(RTYPE, in4, in5, in6, in7, shift);                         \
}
#define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
#define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
1997 
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

/* 3 vector variant: SRAR_H2 plus one more vector.
   Note: inner macro invocations are terminated with ';' for consistency
   with SRARI_H4 / SRLR_H8 in this file. */
#define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
{                                                            \
    SRAR_H2(RTYPE, in0, in1, shift);                         \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

/* 4 vector variant, built from two SRAR_H2 */
#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift);               \
    SRAR_H2(RTYPE, in2, in3, shift);               \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
2031 
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1, (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 number of bits respective element holds in vector 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' is a vector passed in
                 Similar for other pairs
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift);  \
    in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift);  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

/* 4 vector variant, built from two SRAR_W2.
   Note: inner invocations terminated with ';' for consistency with the
   SRARI_* helpers in this file. */
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift);               \
    SRAR_W2(RTYPE, in2, in3, shift);               \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
2056 
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' must be an immediate (compile-time constant),
                 unlike the SRAR_* helpers above which take a vector
                 Similar for other pairs
*/
#define SRARI_H2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift);  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

/* 4 vector variant, built from two SRARI_H2 */
#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)    \
{                                                     \
    SRARI_H2(RTYPE, in0, in1, shift);                 \
    SRARI_H2(RTYPE, in2, in3, shift);                 \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
2082 
/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1     (in place)
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetic by
                 value in 'shift'.
                 The last discarded bit is added to shifted value for rounding
                 and the result is in place written to 'in0'
                 Here, 'shift' must be an immediate (compile-time constant)
                 Similar for other pairs
*/
#define SRARI_W2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

/* 4 vector variant, built from two SRARI_W2 */
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
/* NOTE(review): _SH applies the word operation but reinterprets the result
   as v8i16 — confirm callers rely on this reinterpretation. */
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
2107 
/* Description : Multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 'in0' is multiplied with elements from 'in1'
                 and result is written to 'out0'
                 Similar for other pairs
                 Arguments are parenthesized in the expansion so expression
                 arguments do not interact with operator precedence.
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) * (in1);                     \
    out1 = (in2) * (in3);                     \
}
/* 4 output variant, built from two MUL2 */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}
2125 
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 2 pairs vectors is added and 2 results are
                 produced
                 Arguments are parenthesized in the expansion so expression
                 arguments do not interact with operator precedence.
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) + (in1);                     \
    out1 = (in2) + (in3);                     \
}
/* 4 output variant, built from two ADD2 */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}
2142 
/* Description : Subtraction of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element from 2 pairs vectors is subtracted and 2 results
                 are produced
                 Arguments are parenthesized in the expansion so expression
                 arguments do not interact with operator precedence.
*/
#define SUB2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) - (in1);                     \
    out1 = (in2) - (in3);                     \
}
/* 4 output variant, built from two SUB2 for consistency with ADD4/MUL4 */
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    SUB2(in0, in1, in2, in3, out0, out1);                                     \
    SUB2(in4, in5, in6, in7, out2, out3);                                     \
}
2161 
/* Description : Sign extend byte elements from right half of the vector
   Arguments   : Input  - in    (byte vector)
                 Output - out   (sign extended halfword vector)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted (via compare-less-than-zero, which yields all-ones
                 for negative lanes) and interleaved with same vector 'in' to
                 generate 8 halfword elements keeping sign intact
*/
#define UNPCK_R_SB_SH(in, out)                       \
{                                                    \
    v16i8 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_b((v16i8) in, 0);          \
    out = (v8i16) __msa_ilvr_b(sign_m, (v16i8) in);  \
}
2177 
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Inputs  - in    (input halfword vector)
                 Outputs - out   (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with same vector 'in' to generate
                 4 word elements keeping sign intact
                 With MSA2, a single widening (w2x) instruction is used
                 instead of the compare + interleave sequence.
*/
#if HAVE_MSA2
#define UNPCK_R_SH_SW(in, out)                           \
{                                                        \
    out = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \
}
#else
#define UNPCK_R_SH_SW(in, out)                       \
{                                                    \
    v8i16 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_h((v8i16) in, 0);          \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in);  \
}
#endif // #if HAVE_MSA2
2200 
/* Description : Sign extend byte elements from input vector and return
                 halfword results in pair of vectors
   Arguments   : Inputs  - in           (1 input byte vector)
                 Outputs - out0, out1   (sign extended 2 halfword vectors)
                 Return Type - signed halfword
   Details     : Sign bit of byte elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in' to
                 generate 8 signed halfword elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 8 signed halfword elements in 'out1'
*/
#if HAVE_MSA2
/* Results are halfword vectors, so cast to v8i16 (not v4i32), matching the
   non-MSA2 path below which produces v8i16 via ILVRL_B2_SH. */
#define UNPCK_SB_SH(in, out0, out1)                       \
{                                                         \
    out0 = (v8i16) __builtin_msa2_w2x_lo_s_b((v16i8) in); \
    out1 = (v8i16) __builtin_msa2_w2x_hi_s_b((v16i8) in); \
}
#else
#define UNPCK_SB_SH(in, out0, out1)                  \
{                                                    \
    v16i8 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_b((v16i8) in, 0);           \
    ILVRL_B2_SH(tmp_m, in, out0, out1);              \
}
#endif // #if HAVE_MSA2
2227 
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in           (1 input unsigned byte vector)
                 Outputs - out0, out1   (unsigned 2 halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
                 (interleaving each byte with a zero byte widens it to a
                 halfword)
*/
#define UNPCK_UB_SH(in, out0, out1)                   \
{                                                     \
    v16i8 zero_m = { 0 };                             \
                                                      \
    ILVRL_B2_SH(zero_m, in, out0, out1);              \
}
2241 
/* Description : Sign extend halfword elements from input vector and return
                 result in pair of vectors
   Arguments   : Inputs  - in           (1 input halfword vector)
                 Outputs - out0, out1   (sign extended 2 word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with same vector 'in' to
                 generate 4 signed word elements in 'out0'
                 Then interleaved left with same vector 'in' to
                 generate 4 signed word elements in 'out1'
                 With MSA2, single widening (w2x) instructions are used
                 instead of the compare + interleave sequence.
*/
#if HAVE_MSA2
#define UNPCK_SH_SW(in, out0, out1)                       \
{                                                         \
    out0 = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \
    out1 = (v4i32) __builtin_msa2_w2x_hi_s_h((v8i16) in); \
}
#else
#define UNPCK_SH_SW(in, out0, out1)                  \
{                                                    \
    v8i16 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_h((v8i16) in, 0);           \
    ILVRL_H2_SW(tmp_m, in, out0, out1);              \
}
#endif // #if HAVE_MSA2
2268 
/* Description : Swap two variables
   Arguments   : Inputs  - in0, in1
                 Outputs - in0, in1 (in-place)
   Details     : Swap via a temporary of the same type (GNU __typeof__,
                 consistent with the GNU extensions already used in this
                 header). Unlike the previous XOR-based swap, this is also
                 correct when both arguments name the same object (the XOR
                 trick would zero the value in that case).
*/
#define SWAP(in0, in1)               \
{                                    \
    __typeof__(in0) tmp_m = (in0);   \
                                     \
    in0 = in1;                       \
    in1 = tmp_m;                     \
}
2280 
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation: sums of mirrored pairs in the first
                 half of the outputs, differences in the second half.
                 Arguments are parenthesized in the expansion so expression
                 arguments do not interact with operator precedence.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                \
    out0 = (in0) + (in3);                                        \
    out1 = (in1) + (in2);                                        \
                                                                 \
    out2 = (in1) - (in2);                                        \
    out3 = (in0) - (in3);                                        \
}
2294 
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ...  in7
                 Outputs - out0 .. out7
   Details     : Butterfly operation: sums of mirrored pairs in the first
                 half of the outputs, differences in the second half.
                 Arguments are parenthesized in the expansion so expression
                 arguments do not interact with operator precedence.
*/
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,          \
                    out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                    \
    out0 = (in0) + (in7);                                            \
    out1 = (in1) + (in6);                                            \
    out2 = (in2) + (in5);                                            \
    out3 = (in3) + (in4);                                            \
                                                                     \
    out4 = (in3) - (in4);                                            \
    out5 = (in2) - (in5);                                            \
    out6 = (in1) - (in6);                                            \
    out7 = (in0) - (in7);                                            \
}
2313 
/* Description : Butterfly of 16 input vectors
   Arguments   : Inputs  - in0 ...  in15
                 Outputs - out0 .. out15
   Details     : Butterfly operation: sums of mirrored pairs in the first
                 half of the outputs, differences in the second half.
                 Arguments are parenthesized in the expansion so expression
                 arguments do not interact with operator precedence.
*/
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                \
                     in8, in9,  in10, in11, in12, in13, in14, in15,         \
                     out0, out1, out2, out3, out4, out5, out6, out7,        \
                     out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                           \
    out0 = (in0) + (in15);                                                  \
    out1 = (in1) + (in14);                                                  \
    out2 = (in2) + (in13);                                                  \
    out3 = (in3) + (in12);                                                  \
    out4 = (in4) + (in11);                                                  \
    out5 = (in5) + (in10);                                                  \
    out6 = (in6) + (in9);                                                   \
    out7 = (in7) + (in8);                                                   \
                                                                            \
    out8 = (in7) - (in8);                                                   \
    out9 = (in6) - (in9);                                                   \
    out10 = (in5) - (in10);                                                 \
    out11 = (in4) - (in11);                                                 \
    out12 = (in3) - (in12);                                                 \
    out13 = (in2) - (in13);                                                 \
    out14 = (in1) - (in14);                                                 \
    out15 = (in0) - (in15);                                                 \
}
2342 
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3      (input 4x4 byte block)
                 Outputs - out0, out1, out2, out3  (output 4x4 byte block)
                 Return Type - unsigned byte
   Details     : Rows are interleaved down to byte granularity so that out0
                 holds all 4 transposed rows packed in its low 16 bytes;
                 out1..out3 are produced by shifting out0 along by 4 bytes
                 (one transposed row) at a time via sldi.
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}
2362 
/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0 ... in7             (input 8x4 byte block;
                           only the low words of each row are used)
                 Outputs - out0, out1, out2, out3  (output 4x8 byte block)
                 Return Type - as per RTYPE
   Details     : Rows are combined pairwise with even-word interleaves and
                 byte/halfword interleaves; the final word/doubleword
                 interleaves distribute the 4 transposed rows to out0..out3.
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
                                                                        \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
                                                                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
2387 
/* Description : Transposes input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                           (input 8x8 byte block)
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                           (output 8x8 byte block)
                 Return Type - as per RTYPE
   Details     : Byte and word interleaves produce the even transposed rows
                 in out0/out2/out4/out6; the odd rows out1/out3/out5/out7 are
                 then extracted from the upper 8 bytes of the even rows with
                 an 8-byte sldi shift.
*/
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                        out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                        \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                \
    v16i8 zeros = { 0 };                                                 \
                                                                         \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                   \
               tmp0_m, tmp1_m, tmp2_m, tmp3_m);                          \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                         \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                         \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                         \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                         \
    SLDI_B4(RTYPE, zeros, out0, zeros, out2, zeros, out4, zeros, out6,   \
            8, out1, out3, out5, out7);                                  \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
2414 
/* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3
                 Return Type - unsigned byte
   Details     : Even-word interleaves (ILVEV_W2 + __msa_ilvev_d) gather
                 word 0 of each group of four inputs into doubleword
                 lanes; the subsequent even/odd byte and halfword
                 interleaves of those gathers assemble the 4 transposed
                 16-byte rows.
                 NOTE: out1 and out3 also serve as scratch registers for
                 intermediate gathers before their final values are
                 written at the end of the macro.
*/
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3)                        \
{                                                                          \
    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
                                                                           \
    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                      \
    out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                      \
    out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                     \
                                                                           \
    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                     \
                                                                           \
    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
    out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
                                                                           \
    tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1);            \
    tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m);        \
    out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
}
2449 
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : ILVEV_D2 first pairs each in'k' with in'k+8' into one
                 vector (the out registers double as scratch for these
                 gathers); successive even/odd interleaves at byte,
                 halfword and word granularity then produce the 8
                 transposed 16-byte rows.
                 NOTE: out0..out7 are repeatedly overwritten as scratch
                 before receiving their final values.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}
2496 
/* Description : Transposes 4x4 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
   Details     : Halfwords of row pairs are right-interleaved; the word
                 interleave then places columns 0/1 in the low/high
                 doublewords of out0 and columns 2/3 in out2, and
                 __msa_ilvl_d extracts the odd columns into out1/out3.
                 Only the lower 4 halfwords of each output are the
                 transposed data; the upper halves are don't-care.
*/
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
                                                                        \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);            \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
2512 
/* Description : Transposes 8x8 block with half word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
   Details     : Builds right/left halfword interleaves of the row pairs
                 (tmp0_m..tmp7_m), then assembles the transposed rows by
                 doubleword packing: PCKEV_D4 yields the even rows
                 out0/out2/out4/out6 and __msa_pckod_d the odd rows
                 out1/out3/out5/out7.
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                       out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                               \
                                                                        \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                            \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                            \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                            \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                            \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,     \
             tmp3_m, tmp7_m, out0, out2, out4, out6);                   \
    out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m);       \
    out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m);       \
    out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m);       \
    out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m);       \
}
#define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
2543 
/* Description : Transposes 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : ILVRL_W2 interleaves the words of rows 0/1 and of rows
                 2/3; the right/left doubleword interleaves then pair
                 those results into the four transposed rows.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
                                                                        \
    out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);            \
    out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);            \
    out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);            \
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
}
2562 
/* Description : Average byte elements from 4 pairs of vectors and store the
                 lower halves as an 8x4 byte block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each of the 4 input pairs is averaged per byte with
                 __msa_ave_u_b, i.e. (a + b) / 2 truncated. The lowest
                 doubleword of each averaged vector is copied out and the 4
                 doublewords are stored to 'pdst' with SD4 at 'stride' apart.
*/
#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    v16u8 avg0_m, avg1_m, avg2_m, avg3_m;                                   \
    uint64_t dw0_m, dw1_m, dw2_m, dw3_m;                                    \
                                                                            \
    avg0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                       \
    avg1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                       \
    avg2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                       \
    avg3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                       \
                                                                            \
    dw0_m = __msa_copy_u_d((v2i64) avg0_m, 0);                              \
    dw1_m = __msa_copy_u_d((v2i64) avg1_m, 0);                              \
    dw2_m = __msa_copy_u_d((v2i64) avg2_m, 0);                              \
    dw3_m = __msa_copy_u_d((v2i64) avg3_m, 0);                              \
    SD4(dw0_m, dw1_m, dw2_m, dw3_m, pdst, stride);                          \
}
2593 
/* Description : Average byte elements from 4 pairs of vectors and store the
                 full vectors as a 16x4 byte block in destination memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Each of the 4 input pairs is averaged per byte with
                 __msa_ave_u_b, i.e. (a + b) / 2 truncated, and the 4
                 resulting vectors are stored to 'pdst' with ST_UB4 at
                 'stride' apart.
*/
#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 avg0_m, avg1_m, avg2_m, avg3_m;                                    \
                                                                             \
    avg0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                        \
    avg1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                        \
    avg2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                        \
    avg3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                        \
                                                                             \
    ST_UB4(avg0_m, avg1_m, avg2_m, avg3_m, pdst, stride);                    \
}
2619 
/* Description : Rounded-average byte elements from 4 pairs of vectors and
                 store the lower halves as an 8x4 byte block in memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : AVER_UB4_UB computes the rounded average (a + b + 1) / 2 of
                 each input pair per byte. The lowest doubleword of each of
                 the 4 averaged vectors is extracted and stored to 'pdst'
                 with SD4 at 'stride' apart.
*/
#define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 avg0_m, avg1_m, avg2_m, avg3_m;                                    \
    uint64_t dw0_m, dw1_m, dw2_m, dw3_m;                                     \
                                                                             \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                      \
                avg0_m, avg1_m, avg2_m, avg3_m);                             \
                                                                             \
    dw0_m = __msa_copy_u_d((v2i64) avg0_m, 0);                               \
    dw1_m = __msa_copy_u_d((v2i64) avg1_m, 0);                               \
    dw2_m = __msa_copy_u_d((v2i64) avg2_m, 0);                               \
    dw3_m = __msa_copy_u_d((v2i64) avg3_m, 0);                               \
    SD4(dw0_m, dw1_m, dw2_m, dw3_m, pdst, stride);                           \
}
2648 
/* Description : Rounded-average byte elements from 4 pairs of vectors and
                 store the full vectors as a 16x4 byte block in memory
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : AVER_UB4_UB computes the rounded average (a + b + 1) / 2 of
                 each input pair per byte; the 4 resulting vectors are stored
                 to 'pdst' with ST_UB4 at 'stride' apart.
*/
#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                             \
    v16u8 avg0_m, avg1_m, avg2_m, avg3_m;                                     \
                                                                              \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                       \
                avg0_m, avg1_m, avg2_m, avg3_m);                              \
    ST_UB4(avg0_m, avg1_m, avg2_m, avg3_m, pdst, stride);                     \
}
2671 
/* Description : Rounded-average byte elements from 4 pairs of vectors,
                 rounded-average the results with the current destination
                 contents and store as an 8x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : The 4 input pairs are averaged per byte as
                 (a + b + 1) / 2; the 4 destination rows are loaded, and
                 AVER_ST8x4_UB averages each intermediate result with its
                 destination row before storing the lower halves back.
*/
#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                          pdst, stride)                            \
{                                                                  \
    v16u8 avg0_m, avg1_m, avg2_m, avg3_m;                          \
    v16u8 ref0_m, ref1_m, ref2_m, ref3_m;                          \
                                                                   \
    LD_UB4(pdst, stride, ref0_m, ref1_m, ref2_m, ref3_m);          \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                avg0_m, avg1_m, avg2_m, avg3_m);                   \
    AVER_ST8x4_UB(ref0_m, avg0_m, ref1_m, avg1_m,                  \
                  ref2_m, avg2_m, ref3_m, avg3_m, pdst, stride);   \
}
2699 
/* Description : Rounded-average byte elements from 4 pairs of vectors,
                 rounded-average the results with the current destination
                 contents and store as a 16x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : The 4 input pairs are averaged per byte as
                 (a + b + 1) / 2; the 4 destination rows are loaded, and
                 AVER_ST16x4_UB averages each intermediate result with its
                 destination row before storing the full vectors back.
*/
#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                           pdst, stride)                            \
{                                                                   \
    v16u8 avg0_m, avg1_m, avg2_m, avg3_m;                           \
    v16u8 ref0_m, ref1_m, ref2_m, ref3_m;                           \
                                                                    \
    LD_UB4(pdst, stride, ref0_m, ref1_m, ref2_m, ref3_m);           \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
                avg0_m, avg1_m, avg2_m, avg3_m);                    \
    AVER_ST16x4_UB(ref0_m, avg0_m, ref1_m, avg1_m,                  \
                   ref2_m, avg2_m, ref3_m, avg3_m, pdst, stride);   \
}
2727 
/* Description : Add block 4x4
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : Least significant 4 halfwords from each input vector are
                 added to the corresponding destination bytes, clipped
                 between 0-255 and then stored back, one 4-byte word per
                 row at 'stride' apart.
*/
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)            \
{                                                                    \
    uint32_t src0_m, src1_m, src2_m, src3_m;                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;                         \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                            \
    v16i8 dst0_m = { 0 };                                            \
    v16i8 dst1_m = { 0 };                                            \
    v16i8 zero_m = { 0 };                                            \
                                                                     \
    /* Pack the two 4-element rows of each pair into one vector.     \
       (Semicolon added: every other helper invocation in this file  \
       is terminated, and relying on the brace-block expansion of    \
       ILVR_D2_SH is fragile.) */                                    \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m);                  \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);               \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                            \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                            \
    /* Widen destination bytes to halfwords, add, clip to 0..255 */  \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);      \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);            \
    CLIP_SH2_0_255(res0_m, res1_m);                                  \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);     \
                                                                     \
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);                      \
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);                      \
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);                      \
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);                      \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);               \
}
2757 
/* Description : Dot product and addition of 3 signed halfword input vectors
   Arguments   : Inputs  - in0, in1, in2, coeff0, coeff1, coeff2
                 Outputs - acc_m (value of the statement expression)
                 Return Type - signed halfword
   Details     : Accumulates three byte-wise dot products into one vector:
                 acc_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
*/
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)        \
( {                                                                \
    v8i16 acc_m;                                                   \
                                                                   \
    acc_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);           \
    acc_m = __msa_dpadd_s_h(acc_m, (v16i8) in1, (v16i8) coeff1);   \
    acc_m = __msa_dpadd_s_h(acc_m, (v16i8) in2, (v16i8) coeff2);   \
    acc_m;                                                         \
} )
2779 
/* Description : Pack even byte elements of two vectors & xor with 128
   Arguments   : Inputs  - in0, in1
                 Outputs - res_m (value of the statement expression)
                 Return Type - unsigned byte
   Details     : Even byte elements of 'in0' and 'in1' are packed into one
                 vector, which is then xor'ed with 128 to move the range
                 from signed to unsigned byte.
*/
#define PCKEV_XORI128_UB(in0, in1)                              \
( {                                                             \
    v16u8 res_m;                                                \
                                                                \
    res_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);    \
    res_m = __msa_xori_b(res_m, 128);                           \
    res_m;                                                      \
} )
2795 
/* Description : Converts inputs to unsigned bytes, interleave, average &
                 store as 8x4 unsigned byte block
   Arguments   : Inputs  - in0, in1, in2, in3, dst0, dst1, pdst, stride
   Details     : Packs each input pair to unsigned bytes via
                 PCKEV_XORI128_UB, rounded-averages the packed vectors with
                 'dst0'/'dst1', and stores the 4 doubleword halves with
                 ST_D4 at 'stride' apart.
*/
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,           \
                                dst0, dst1, pdst, stride)     \
{                                                             \
    uint8_t *dst_ptr_m = (uint8_t *) (pdst);                  \
    v16u8 res0_m, res1_m;                                     \
                                                              \
    res0_m = PCKEV_XORI128_UB(in0, in1);                      \
    res1_m = PCKEV_XORI128_UB(in2, in3);                      \
    AVER_UB2_UB(res0_m, dst0, res1_m, dst1, res0_m, res1_m);  \
    ST_D4(res0_m, res1_m, 0, 1, 0, 1, dst_ptr_m, stride);     \
}
2811 
/* Description : Pack even byte elements, extract words 0 & 2 from each
                 packed result and store 4 words in destination memory as
                 per stride
   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride
   Details     : PCKEV_B2 packs the even bytes of each input pair; words 0
                 and 2 of each packed vector are copied out and written to
                 'pdst' with SW4 at 'stride' apart.
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    v16i8 pck0_m, pck1_m;                                 \
    uint32_t w0_m, w1_m, w2_m, w3_m;                      \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, pck0_m, pck1_m);      \
                                                          \
    w0_m = __msa_copy_u_w((v4i32) pck0_m, 0);             \
    w1_m = __msa_copy_u_w((v4i32) pck0_m, 2);             \
    w2_m = __msa_copy_u_w((v4i32) pck1_m, 0);             \
    w3_m = __msa_copy_u_w((v4i32) pck1_m, 2);             \
    SW4(w0_m, w1_m, w2_m, w3_m, pdst, stride);            \
}
2831 
/* Description : Pack even byte elements of two vectors and store the
                 packed vector in destination memory
   Arguments   : Inputs  - in0, in1, pdst
*/
#define PCKEV_ST_SB(in0, in1, pdst)                    \
{                                                      \
    v16i8 pck_m;                                       \
                                                       \
    pck_m = __msa_pckev_b((v16i8) in1, (v16i8) in0);   \
    ST_SB(pck_m, (pdst));                              \
}
2842 
/* Description : Horizontal 2 tap filter kernel code
   Arguments   : Inputs  - in0, in1, mask, coeff, shift
   Details     : Shuffles bytes of 'in0'/'in1' per 'mask', takes the
                 unsigned byte dot product with 'coeff', then rounds
                 (srari) and saturates (sat) the halfword results by
                 'shift'. Value of the statement expression is the
                 filtered v8u16 vector.
*/
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)             \
( {                                                                  \
    v16i8 shf_m;                                                     \
    v8u16 res_m;                                                     \
                                                                     \
    shf_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0);    \
    res_m = __msa_dotp_u_h((v16u8) shf_m, (v16u8) coeff);            \
    res_m = (v8u16) __msa_srari_h((v8i16) res_m, shift);             \
    res_m = __msa_sat_u_h(res_m, shift);                             \
    res_m;                                                           \
} )
2858 #endif  /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */
2859