• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // MSA common macros
11 //
12 // Author(s):  Prashant Patil   (prashant.patil@imgtec.com)
13 
14 #ifndef WEBP_DSP_MSA_MACRO_H_
15 #define WEBP_DSP_MSA_MACRO_H_
16 
17 #include "src/dsp/dsp.h"
18 
19 #if defined(WEBP_USE_MSA)
20 
21 #include <stdint.h>
22 #include <msa.h>
23 
#if defined(__clang__)
  #define CLANG_BUILD
#endif

/* Description : Immediate-operand helpers
 * Arguments   : Inputs - a (vector operand), b (immediate / shift amount)
 * Details     : For clang builds each helper forwards to the matching MSA
 *               builtin after casting 'a' to the builtin's operand type.
 *               For other compilers the plain vector operator is used and
 *               the result keeps the type of 'a'.
 *               ALPHAVAL is (-1) for clang builds and (0xff) otherwise.
 */
#ifdef CLANG_BUILD
  #define ALPHAVAL  (-1)
  #define ADDVI_H(a, b)  __msa_addvi_h((v8i16)(a), (b))
  #define ADDVI_W(a, b)  __msa_addvi_w((v4i32)(a), (b))
  #define SRAI_B(a, b)  __msa_srai_b((v16i8)(a), (b))
  #define SRAI_H(a, b)  __msa_srai_h((v8i16)(a), (b))
  #define SRAI_W(a, b)  __msa_srai_w((v4i32)(a), (b))
  #define SRLI_H(a, b)  __msa_srli_h((v8i16)(a), (b))
  /* Byte-lane shift: cast to v16i8 like the other *_B helpers. */
  #define SLLI_B(a, b)  __msa_slli_b((v16i8)(a), (b))
  #define ANDI_B(a, b)  __msa_andi_b((v16u8)(a), (b))
  #define ORI_B(a, b)   __msa_ori_b((v16u8)(a), (b))
#else
  #define ALPHAVAL  (0xff)
  #define ADDVI_H(a, b)  ((a) + (b))
  #define ADDVI_W(a, b)  ((a) + (b))
  #define SRAI_B(a, b)  ((a) >> (b))
  #define SRAI_H(a, b)  ((a) >> (b))
  #define SRAI_W(a, b)  ((a) >> (b))
  /* Logical right shift: shift the unsigned halfword view so zeros are
   * shifted in, then convert back to the type of 'a'. */
  #define SRLI_H(a, b)  ((__typeof__(a))((v8u16)(a) >> (b)))
  #define SLLI_B(a, b)  ((a) << (b))
  #define ANDI_B(a, b)  ((a) & (b))
  #define ORI_B(a, b)   ((a) | (b))
#endif
51 
/* Description : Load / store one vector through a pointer
 * Arguments   : RTYPE - type used for the access
 *               psrc  - address to load from (LD_*)
 *               in    - value to store (ST_*)
 *               pdst  - address to store to (ST_*)
 * Details     : The pointer is cast to a pointer to RTYPE and dereferenced.
 *               Each expansion is fully parenthesized so it can be used
 *               inside a larger expression.
 */
#define LD_B(RTYPE, psrc) (*((const RTYPE*)(psrc)))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) (*((const RTYPE*)(psrc)))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

#define LD_W(RTYPE, psrc) (*((const RTYPE*)(psrc)))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)

#define ST_B(RTYPE, in, pdst) (*((RTYPE*)(pdst)) = (in))
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) (*((RTYPE*)(pdst)) = (in))
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) (*((RTYPE*)(pdst)) = (in))
#define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
75 
/* Description : Generators for scalar load/store helpers built on inline asm
 * Arguments   : TYPE      - integer type transferred by the instruction
 *               INSTR     - instruction mnemonic pasted into the asm template
 *               FUNC_NAME - name of the generated static function
 * Details     : MSA_LOAD_FUNC defines TYPE FUNC_NAME(const void* psrc), which
 *               executes 'INSTR reg, mem' and returns the register value.
 *               MSA_STORE_FUNC defines void FUNC_NAME(TYPE val, void* pdst),
 *               which executes 'INSTR reg, mem'.
 *               The memory operand is described as sizeof(TYPE) bytes so the
 *               compiler knows the full extent that is read or written, not
 *               only the first byte.
 */
#define MSA_LOAD_FUNC(TYPE, INSTR, FUNC_NAME)                        \
  static WEBP_INLINE TYPE FUNC_NAME(const void* const psrc) {       \
    const uint8_t* const psrc_m = (const uint8_t*)psrc;              \
    TYPE val_m;                                                      \
    asm volatile (                                                   \
      "" #INSTR " %[val_m], %[psrc_m]  \n\t"                         \
      : [val_m] "=r" (val_m)                                         \
      : [psrc_m] "m" (*(const uint8_t (*)[sizeof(TYPE)])psrc_m));    \
    return val_m;                                                    \
  }

/* Calls the load helper FUNC_NAME on 'psrc'. */
#define MSA_LOAD(psrc, FUNC_NAME)  FUNC_NAME(psrc)

#define MSA_STORE_FUNC(TYPE, INSTR, FUNC_NAME)                       \
  static WEBP_INLINE void FUNC_NAME(TYPE val, void* const pdst) {   \
    uint8_t* const pdst_m = (uint8_t*)pdst;                          \
    TYPE val_m = val;                                                \
    asm volatile (                                                   \
      " " #INSTR "  %[val_m],  %[pdst_m]  \n\t"                      \
      : [pdst_m] "=m" (*(uint8_t (*)[sizeof(TYPE)])pdst_m)           \
      : [val_m] "r" (val_m));                                        \
  }

/* Calls the store helper FUNC_NAME with 'val' and 'pdst'. */
#define MSA_STORE(val, pdst, FUNC_NAME)  FUNC_NAME(val, pdst)
100 
101 #if (__mips_isa_rev >= 6)
102   MSA_LOAD_FUNC(uint16_t, lh, msa_lh);
103   #define LH(psrc)  MSA_LOAD(psrc, msa_lh)
104   MSA_LOAD_FUNC(uint32_t, lw, msa_lw);
105   #define LW(psrc)  MSA_LOAD(psrc, msa_lw)
106   #if (__mips == 64)
107     MSA_LOAD_FUNC(uint64_t, ld, msa_ld);
108     #define LD(psrc)  MSA_LOAD(psrc, msa_ld)
109   #else  // !(__mips == 64)
110     #define LD(psrc)  ((((uint64_t)MSA_LOAD(psrc + 4, msa_lw)) << 32) | \
111                        MSA_LOAD(psrc, msa_lw))
112   #endif  // (__mips == 64)
113 
114   MSA_STORE_FUNC(uint16_t, sh, msa_sh);
115   #define SH(val, pdst)  MSA_STORE(val, pdst, msa_sh)
116   MSA_STORE_FUNC(uint32_t, sw, msa_sw);
117   #define SW(val, pdst)  MSA_STORE(val, pdst, msa_sw)
118   MSA_STORE_FUNC(uint64_t, sd, msa_sd);
119   #define SD(val, pdst)  MSA_STORE(val, pdst, msa_sd)
120 #else  // !(__mips_isa_rev >= 6)
121   MSA_LOAD_FUNC(uint16_t, ulh, msa_ulh);
122   #define LH(psrc)  MSA_LOAD(psrc, msa_ulh)
123   MSA_LOAD_FUNC(uint32_t, ulw, msa_ulw);
124   #define LW(psrc)  MSA_LOAD(psrc, msa_ulw)
125   #if (__mips == 64)
126     MSA_LOAD_FUNC(uint64_t, uld, msa_uld);
127     #define LD(psrc)  MSA_LOAD(psrc, msa_uld)
128   #else  // !(__mips == 64)
129     #define LD(psrc)  ((((uint64_t)MSA_LOAD(psrc + 4, msa_ulw)) << 32) | \
130                         MSA_LOAD(psrc, msa_ulw))
131   #endif  // (__mips == 64)
132 
133   MSA_STORE_FUNC(uint16_t, ush, msa_ush);
134   #define SH(val, pdst)  MSA_STORE(val, pdst, msa_ush)
135   MSA_STORE_FUNC(uint32_t, usw, msa_usw);
136   #define SW(val, pdst)  MSA_STORE(val, pdst, msa_usw)
137   #define SD(val, pdst) do {                                               \
138     uint8_t* const pdst_sd_m = (uint8_t*)(pdst);                           \
139     const uint32_t val0_m = (uint32_t)(val & 0x00000000FFFFFFFF);          \
140     const uint32_t val1_m = (uint32_t)((val >> 32) & 0x00000000FFFFFFFF);  \
141     SW(val0_m, pdst_sd_m);                                                 \
142     SW(val1_m, pdst_sd_m + 4);                                             \
143   } while (0)
144 #endif  // (__mips_isa_rev >= 6)
145 
/* Description : Load 4 words with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1, out2, out3
 * Details     : Load word in 'out0' from (psrc)
 *               Load word in 'out1' from (psrc + stride)
 *               Load word in 'out2' from (psrc + 2 * stride)
 *               Load word in 'out3' from (psrc + 3 * stride)
 *               'stride' is a byte offset: 'psrc' is converted to a byte
 *               pointer before stepping.
 */
#define LW4(psrc, stride, out0, out1, out2, out3) do {   \
  const uint8_t* psrc_lw4_m = (const uint8_t*)(psrc);    \
  (out0) = LW(psrc_lw4_m);                               \
  psrc_lw4_m += (stride);                                \
  (out1) = LW(psrc_lw4_m);                               \
  psrc_lw4_m += (stride);                                \
  (out2) = LW(psrc_lw4_m);                               \
  psrc_lw4_m += (stride);                                \
  (out3) = LW(psrc_lw4_m);                               \
} while (0)
164 
/* Description : Store words with stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Store word from 'in0' to (pdst)
 *               Store word from 'in1' to (pdst + stride)
 *               Store word from 'in2' to (pdst + 2 * stride)
 *               Store word from 'in3' to (pdst + 3 * stride)
 *               'stride' is a byte offset: 'pdst' is converted to a byte
 *               pointer before stepping.
 */
#define SW4(in0, in1, in2, in3, pdst, stride) do {  \
  uint8_t* pdst_sw4_m = (uint8_t*)(pdst);           \
  SW(in0, pdst_sw4_m);                              \
  pdst_sw4_m += (stride);                           \
  SW(in1, pdst_sw4_m);                              \
  pdst_sw4_m += (stride);                           \
  SW(in2, pdst_sw4_m);                              \
  pdst_sw4_m += (stride);                           \
  SW(in3, pdst_sw4_m);                              \
} while (0)

/* Same as SW4 for three words: in0, in1, in2. */
#define SW3(in0, in1, in2, pdst, stride) do {  \
  uint8_t* pdst_sw3_m = (uint8_t*)(pdst);      \
  SW(in0, pdst_sw3_m);                         \
  pdst_sw3_m += (stride);                      \
  SW(in1, pdst_sw3_m);                         \
  pdst_sw3_m += (stride);                      \
  SW(in2, pdst_sw3_m);                         \
} while (0)

/* Same as SW4 for two words: in0, in1. */
#define SW2(in0, in1, pdst, stride) do {  \
  uint8_t* pdst_sw2_m = (uint8_t*)(pdst); \
  SW(in0, pdst_sw2_m);                    \
  pdst_sw2_m += (stride);                 \
  SW(in1, pdst_sw2_m);                    \
} while (0)
198 
/* Description : Store 4 double words with stride
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Store double word from 'in0' to (pdst)
 *               Store double word from 'in1' to (pdst + stride)
 *               Store double word from 'in2' to (pdst + 2 * stride)
 *               Store double word from 'in3' to (pdst + 3 * stride)
 *               'stride' is a byte offset: 'pdst' is converted to a byte
 *               pointer before stepping.
 */
#define SD4(in0, in1, in2, in3, pdst, stride) do {  \
  uint8_t* pdst_sd4_m = (uint8_t*)(pdst);           \
  SD(in0, pdst_sd4_m);                              \
  pdst_sd4_m += (stride);                           \
  SD(in1, pdst_sd4_m);                              \
  pdst_sd4_m += (stride);                           \
  SD(in2, pdst_sd4_m);                              \
  pdst_sd4_m += (stride);                           \
  SD(in3, pdst_sd4_m);                              \
} while (0)
216 
/* Description : Load vectors with 16 byte elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1 (LD_B3/LD_B4/LD_B8 add out2 .. out7)
 *               Return Type - as per RTYPE
 * Details     : Load 16 byte elements in 'out0' from (psrc)
 *               Load 16 byte elements in 'out1' from (psrc + stride)
 *               Further outputs 'outN' are loaded from (psrc + N * stride)
 * Note        : The offset is added to 'psrc' as written, so it is scaled by
 *               the element size of the caller's pointer type. 'psrc' and
 *               'stride' are expanded more than once.
 */
#define LD_B2(RTYPE, psrc, stride, out0, out1) do {  \
  (out0) = LD_B(RTYPE, (psrc));                      \
  (out1) = LD_B(RTYPE, (psrc) + (stride));           \
} while (0)
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) do {  \
  LD_B2(RTYPE, (psrc), (stride), out0, out1);              \
  (out2) = LD_B(RTYPE, (psrc) + 2 * (stride));             \
} while (0)
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) do {  \
  LD_B2(RTYPE, (psrc), (stride), out0, out1);                    \
  LD_B2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3);     \
} while (0)
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7) do {      \
  LD_B4(RTYPE, (psrc), (stride), out0, out1, out2, out3);               \
  LD_B4(RTYPE, (psrc) + 4 * (stride), (stride), out4, out5, out6, out7); \
} while (0)
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
252 
/* Description : Load vectors with 8 halfword elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Load 8 halfword elements in 'out0' from (psrc)
 *               Load 8 halfword elements in 'out1' from (psrc + stride)
 * Note        : The offset is added to 'psrc' as written, so it is scaled by
 *               the element size of the caller's pointer type.
 */
#define LD_H2(RTYPE, psrc, stride, out0, out1) do {  \
  (out0) = LD_H(RTYPE, (psrc));                      \
  (out1) = LD_H(RTYPE, (psrc) + (stride));           \
} while (0)
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
265 
/* Description : Load vectors with 4 word elements with stride
 * Arguments   : Inputs  - psrc, stride
 *               Outputs - out0, out1 (LD_W3 adds out2, LD_W4 adds out2, out3)
 *               Return Type - as per RTYPE
 * Details     : Load 4 word elements in 'out0' from (psrc + 0 * stride)
 *               Load 4 word elements in 'out1' from (psrc + 1 * stride)
 *               Load 4 word elements in 'out2' from (psrc + 2 * stride)
 *               Load 4 word elements in 'out3' from (psrc + 3 * stride)
 * Note        : The offset is added to 'psrc' as written, so it is scaled by
 *               the element size of the caller's pointer type. 'psrc' and
 *               'stride' are expanded more than once.
 */
#define LD_W2(RTYPE, psrc, stride, out0, out1) do {  \
  (out0) = LD_W(RTYPE, (psrc));                      \
  (out1) = LD_W(RTYPE, (psrc) + (stride));           \
} while (0)
#define LD_UW2(...) LD_W2(v4u32, __VA_ARGS__)
#define LD_SW2(...) LD_W2(v4i32, __VA_ARGS__)

#define LD_W3(RTYPE, psrc, stride, out0, out1, out2) do {  \
  LD_W2(RTYPE, (psrc), (stride), out0, out1);              \
  (out2) = LD_W(RTYPE, (psrc) + 2 * (stride));             \
} while (0)
#define LD_UW3(...) LD_W3(v4u32, __VA_ARGS__)
#define LD_SW3(...) LD_W3(v4i32, __VA_ARGS__)

#define LD_W4(RTYPE, psrc, stride, out0, out1, out2, out3) do {  \
  LD_W2(RTYPE, (psrc), (stride), out0, out1);                    \
  LD_W2(RTYPE, (psrc) + 2 * (stride), (stride), out2, out3);     \
} while (0)
#define LD_UW4(...) LD_W4(v4u32, __VA_ARGS__)
#define LD_SW4(...) LD_W4(v4i32, __VA_ARGS__)
294 
/* Description : Store vectors of 16 byte elements with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 *               (ST_B4 takes in0 .. in3, ST_B8 takes in0 .. in7)
 * Details     : Store 16 byte elements from 'in0' to (pdst)
 *               Store 16 byte elements from 'in1' to (pdst + stride)
 *               Further inputs 'inN' are stored to (pdst + N * stride)
 * Note        : The offset is added to 'pdst' as written, so it is scaled by
 *               the element size of the caller's pointer type. 'pdst' and
 *               'stride' are expanded more than once.
 */
#define ST_B2(RTYPE, in0, in1, pdst, stride) do {  \
  ST_B(RTYPE, in0, (pdst));                        \
  ST_B(RTYPE, in1, (pdst) + (stride));             \
} while (0)
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) do {    \
  ST_B2(RTYPE, in0, in1, (pdst), (stride));                    \
  ST_B2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride));     \
} while (0)
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,          \
              pdst, stride) do {                                      \
  ST_B4(RTYPE, in0, in1, in2, in3, (pdst), (stride));                 \
  ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * (stride), (stride));  \
} while (0)
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
320 
/* Description : Store vectors of 4 word elements with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 *               (ST_W3 adds in2, ST_W4 adds in2, in3)
 * Details     : Store 4 word elements from 'in0' to (pdst + 0 * stride)
 *               Store 4 word elements from 'in1' to (pdst + 1 * stride)
 *               Store 4 word elements from 'in2' to (pdst + 2 * stride)
 *               Store 4 word elements from 'in3' to (pdst + 3 * stride)
 * Note        : The offset is added to 'pdst' as written, so it is scaled by
 *               the element size of the caller's pointer type. 'pdst' and
 *               'stride' are expanded more than once.
 */
#define ST_W2(RTYPE, in0, in1, pdst, stride) do {  \
  ST_W(RTYPE, in0, (pdst));                        \
  ST_W(RTYPE, in1, (pdst) + (stride));             \
} while (0)
#define ST_UW2(...) ST_W2(v4u32, __VA_ARGS__)
#define ST_SW2(...) ST_W2(v4i32, __VA_ARGS__)

#define ST_W3(RTYPE, in0, in1, in2, pdst, stride) do {  \
  ST_W2(RTYPE, in0, in1, (pdst), (stride));             \
  ST_W(RTYPE, in2, (pdst) + 2 * (stride));              \
} while (0)
#define ST_UW3(...) ST_W3(v4u32, __VA_ARGS__)
#define ST_SW3(...) ST_W3(v4i32, __VA_ARGS__)

#define ST_W4(RTYPE, in0, in1, in2, in3, pdst, stride) do {    \
  ST_W2(RTYPE, in0, in1, (pdst), (stride));                    \
  ST_W2(RTYPE, in2, in3, (pdst) + 2 * (stride), (stride));     \
} while (0)
#define ST_UW4(...) ST_W4(v4u32, __VA_ARGS__)
#define ST_SW4(...) ST_W4(v4i32, __VA_ARGS__)
348 
/* Description : Store vectors of 8 halfword elements with stride
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Store 8 halfword elements from 'in0' to (pdst)
 *               Store 8 halfword elements from 'in1' to (pdst + stride)
 * Note        : The offset is added to 'pdst' as written, so it is scaled by
 *               the element size of the caller's pointer type.
 */
#define ST_H2(RTYPE, in0, in1, pdst, stride) do {  \
  ST_H(RTYPE, in0, (pdst));                        \
  ST_H(RTYPE, in1, (pdst) + (stride));             \
} while (0)
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
360 
/* Description : Store 2x4 byte block to destination memory from input vector
 * Arguments   : Inputs - in, stidx, pdst, stride
 * Details     : Index 'stidx' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst)
 *               Index 'stidx+1' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst + stride)
 *               Index 'stidx+2' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst + 2 * stride)
 *               Index 'stidx+3' halfword element from 'in' vector is copied to
 *               the GP register and stored to (pdst + 3 * stride)
 *               'stride' is a byte offset: 'pdst' is converted to a byte
 *               pointer before stepping. All four elements are read before
 *               the first store.
 */
#define ST2x4_UB(in, stidx, pdst, stride) do {                       \
  uint8_t* pblk_2x4_m = (uint8_t*)(pdst);                            \
  const uint16_t out0_m = __msa_copy_s_h((v8i16)(in), (stidx));      \
  const uint16_t out1_m = __msa_copy_s_h((v8i16)(in), (stidx) + 1);  \
  const uint16_t out2_m = __msa_copy_s_h((v8i16)(in), (stidx) + 2);  \
  const uint16_t out3_m = __msa_copy_s_h((v8i16)(in), (stidx) + 3);  \
  SH(out0_m, pblk_2x4_m);                                            \
  pblk_2x4_m += (stride);                                            \
  SH(out1_m, pblk_2x4_m);                                            \
  pblk_2x4_m += (stride);                                            \
  SH(out2_m, pblk_2x4_m);                                            \
  pblk_2x4_m += (stride);                                            \
  SH(out3_m, pblk_2x4_m);                                            \
} while (0)
386 
/* Description : Store 4x4 byte block to destination memory from input vectors
 * Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
 * Details     : 'idx0' word element from input vector 'in0' is copied to the
 *               GP register and stored to (pdst)
 *               'idx1' word element from input vector 'in0' is copied to the
 *               GP register and stored to (pdst + stride)
 *               'idx2' word element from input vector 'in1' is copied to the
 *               GP register and stored to (pdst + 2 * stride)
 *               'idx3' word element from input vector 'in1' is copied to the
 *               GP register and stored to (pdst + 3 * stride)
 *               'stride' is a byte offset.
 */
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) do {  \
  uint8_t* const pblk_4x4_m = (uint8_t*)(pdst);                        \
  const uint32_t out0_m = __msa_copy_s_w((v4i32)(in0), (idx0));        \
  const uint32_t out1_m = __msa_copy_s_w((v4i32)(in0), (idx1));        \
  const uint32_t out2_m = __msa_copy_s_w((v4i32)(in1), (idx2));        \
  const uint32_t out3_m = __msa_copy_s_w((v4i32)(in1), (idx3));        \
  SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, (stride));           \
} while (0)

/* Description : Store 4x8 byte block to destination memory
 * Arguments   : Inputs - in0, in1, pdst, stride
 * Details     : Word elements 0..3 of 'in0' are stored to rows 0..3 and word
 *               elements 0..3 of 'in1' to rows 4..7, rows being 'stride'
 *               bytes apart starting at (pdst)
 */
#define ST4x8_UB(in0, in1, pdst, stride) do {                           \
  uint8_t* const pblk_4x8 = (uint8_t*)(pdst);                           \
  ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, (stride));                   \
  ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * (stride), (stride));    \
} while (0)
412 
/* Description : Immediate number of elements to slide
 * Arguments   : Inputs  - in0, in1, slide_val
 *               Outputs - out
 *               Return Type - as per RTYPE
 * Details     : Byte elements from 'in1' vector are slid into 'in0' by
 *               value specified in the 'slide_val'
 *               Both inputs are passed to __msa_sldi_b as v16i8 and the
 *               result is cast to RTYPE.
 */
#define SLDI_B(RTYPE, in0, in1, slide_val)                            \
        ((RTYPE)__msa_sldi_b((v16i8)(in0), (v16i8)(in1), (slide_val)))

#define SLDI_UB(...) SLDI_B(v16u8, __VA_ARGS__)
#define SLDI_SB(...) SLDI_B(v16i8, __VA_ARGS__)
#define SLDI_SH(...) SLDI_B(v8i16, __VA_ARGS__)
426 
/* Description : Shuffle byte vector elements as per mask vector
 * Arguments   : VSHF_B  : Inputs  - in0, in1, mask
 *                         Result  - value of the expression
 *               VSHF_B2 : Inputs  - in0, in1, in2, in3, mask0, mask1
 *                         Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Byte elements from 'in0' & 'in1' are copied selectively to
 *               'out0' as per control vector 'mask0'
 *               VSHF_B2 does the same for 'in2' & 'in3' with 'mask1' into
 *               'out1'.
 *               The builtin is called as __msa_vshf_b(mask, in1, in0).
 */
#define VSHF_B(RTYPE, in0, in1, mask)                                    \
        ((RTYPE)__msa_vshf_b((v16i8)(mask), (v16i8)(in1), (v16i8)(in0)))

#define VSHF_UB(...) VSHF_B(v16u8, __VA_ARGS__)
#define VSHF_SB(...) VSHF_B(v16i8, __VA_ARGS__)
#define VSHF_UH(...) VSHF_B(v8u16, __VA_ARGS__)
#define VSHF_SH(...) VSHF_B(v8i16, __VA_ARGS__)

#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do {  \
  (out0) = VSHF_B(RTYPE, in0, in1, mask0);                                 \
  (out1) = VSHF_B(RTYPE, in2, in3, mask1);                                 \
} while (0)
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
450 
/* Description : Shuffle halfword vector elements as per mask vector
 * Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : halfword elements from 'in0' & 'in1' are copied selectively to
 *               'out0' as per control vector 'mask0'
 *               'out1' is produced the same way from 'in2' & 'in3' with
 *               'mask1'.
 *               The builtin is called as __msa_vshf_h(mask, second, first).
 */
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do {      \
  (out0) = (RTYPE)__msa_vshf_h((v8i16)(mask0), (v8i16)(in1), (v8i16)(in0));    \
  (out1) = (RTYPE)__msa_vshf_h((v8i16)(mask1), (v8i16)(in3), (v8i16)(in2));    \
} while (0)
#define VSHF_H2_UH(...) VSHF_H2(v8u16, __VA_ARGS__)
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
464 
/* Description : Dot product of byte vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Signed byte elements from 'mult0' are multiplied with
 *               signed byte elements from 'cnst0' producing a result
 *               twice the size of input i.e. signed halfword.
 *               The multiplication result of adjacent odd-even elements
 *               are added together and written to the 'out0' vector
 *               'out1' is produced the same way from 'mult1' and 'cnst1'.
 */
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {    \
  (out0) = (RTYPE)__msa_dotp_s_h((v16i8)(mult0), (v16i8)(cnst0));       \
  (out1) = (RTYPE)__msa_dotp_s_h((v16i8)(mult1), (v16i8)(cnst1));       \
} while (0)
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
480 
/* Description : Dot product of halfword vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Signed halfword elements from 'mult0' are multiplied with
 *               signed halfword elements from 'cnst0' producing a result
 *               twice the size of input i.e. signed word.
 *               The multiplication result of adjacent odd-even elements
 *               are added together and written to the 'out0' vector
 *               'out1' is produced the same way from 'mult1' and 'cnst1'.
 */
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {    \
  (out0) = (RTYPE)__msa_dotp_s_w((v8i16)(mult0), (v8i16)(cnst0));       \
  (out1) = (RTYPE)__msa_dotp_s_w((v8i16)(mult1), (v8i16)(cnst1));       \
} while (0)
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
496 
/* Description : Dot product of unsigned word vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Unsigned word elements from 'mult0' are multiplied with
 *               unsigned word elements from 'cnst0' producing a result
 *               twice the size of input i.e. unsigned double word.
 *               The multiplication result of adjacent odd-even elements
 *               are added together and written to the 'out0' vector
 *               'out1' is produced the same way from 'mult1' and 'cnst1'.
 */
#define DOTP_UW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {    \
  (out0) = (RTYPE)__msa_dotp_u_d((v4u32)(mult0), (v4u32)(cnst0));       \
  (out1) = (RTYPE)__msa_dotp_u_d((v4u32)(mult1), (v4u32)(cnst1));       \
} while (0)
#define DOTP_UW2_UD(...) DOTP_UW2(v2u64, __VA_ARGS__)
512 
/* Description : Dot product & addition of halfword vector elements
 * Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
 *               Input/output - out0, out1 (accumulators)
 *               Return Type - as per RTYPE
 * Details     : Signed halfword elements from 'mult0' are multiplied with
 *               signed halfword elements from 'cnst0' producing a result
 *               twice the size of input i.e. signed word.
 *               The multiplication result of adjacent odd-even elements
 *               are added to the 'out0' vector
 *               'out1' is accumulated the same way from 'mult1' and 'cnst1'.
 */
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do {   \
  (out0) = (RTYPE)__msa_dpadd_s_w((v4i32)(out0),                        \
                                  (v8i16)(mult0), (v8i16)(cnst0));      \
  (out1) = (RTYPE)__msa_dpadd_s_w((v4i32)(out1),                        \
                                  (v8i16)(mult1), (v8i16)(cnst1));      \
} while (0)
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
528 
/* Description : Clamp every signed halfword element of a vector to [0, 255]
 * Arguments   : Input/output  - val (updated in place)
 *               Return Type - signed halfword
 * Details     : Elements are first raised to at least 0, then limited to at
 *               most 255.
 */
#define CLIP_SH_0_255(val) do {                                           \
  const v8i16 max_m = __msa_ldi_h(255);                                   \
  val = __msa_min_s_h(max_m, (v8i16)__msa_maxi_s_h((v8i16)val, 0));       \
} while (0)

/* Applies CLIP_SH_0_255 to two vectors. */
#define CLIP_SH2_0_255(in0, in1) do {  \
  CLIP_SH_0_255(in0);                  \
  CLIP_SH_0_255(in1);                  \
} while (0)

/* Applies CLIP_SH_0_255 to four vectors, in argument order. */
#define CLIP_SH4_0_255(in0, in1, in2, in3) do {  \
  CLIP_SH2_0_255(in0, in1);                      \
  CLIP_SH2_0_255(in2, in3);                      \
} while (0)
549 
/* Description : Clips all unsigned halfword elements of input vector
 *               between 0 & 255
 * Arguments   : Input/output - in (updated in place)
 *               Return Type - unsigned halfword
 * Details     : Unsigned elements are never below 0, so only the upper
 *               bound of 255 has to be applied.
 */
#define CLIP_UH_0_255(in) do {                    \
  const v8u16 max_m = (v8u16)__msa_ldi_h(255);    \
  (in) = __msa_min_u_h(max_m, (v8u16)(in));       \
} while (0)

/* Applies CLIP_UH_0_255 to two vectors. */
#define CLIP_UH2_0_255(in0, in1) do {  \
  CLIP_UH_0_255(in0);                  \
  CLIP_UH_0_255(in1);                  \
} while (0)
566 
/* Description : Clamp every signed word element of a vector to [0, 255]
 * Arguments   : Input/output  - val (updated in place)
 *               Return Type - signed word
 * Details     : Elements are first raised to at least 0, then limited to at
 *               most 255.
 */
#define CLIP_SW_0_255(val) do {                                           \
  const v4i32 max_m = __msa_ldi_w(255);                                   \
  val = __msa_min_s_w(max_m, (v4i32)__msa_maxi_s_w((v4i32)val, 0));       \
} while (0)

/* Applies CLIP_SW_0_255 to four vectors, in argument order. */
#define CLIP_SW4_0_255(in0, in1, in2, in3) do {   \
  CLIP_SW_0_255(in0);                             \
  CLIP_SW_0_255(in1);                             \
  CLIP_SW_0_255(in2);                             \
  CLIP_SW_0_255(in3);                             \
} while (0)
584 
585 /* Description : Horizontal addition of 4 signed word elements of input vector
586  * Arguments   : Input  - in       (signed word vector)
587  *               Output - sum_m    (i32 sum)
588  *               Return Type - signed word (GP)
589  * Details     : 4 signed word elements of 'in' vector are added together and
590  *               the resulting integer sum is returned
591  */
func_hadd_sw_s32(v4i32 in)592 static WEBP_INLINE int32_t func_hadd_sw_s32(v4i32 in) {
593   const v2i64 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);
594   const v2i64 res1_m = __msa_splati_d(res0_m, 1);
595   const v2i64 out = res0_m + res1_m;
596   int32_t sum_m = __msa_copy_s_w((v4i32)out, 0);
597   return sum_m;
598 }
599 #define HADD_SW_S32(in) func_hadd_sw_s32(in)
600 
601 /* Description : Horizontal addition of 8 signed halfword elements
602  * Arguments   : Input  - in       (signed halfword vector)
603  *               Output - sum_m    (s32 sum)
604  *               Return Type - signed word
605  * Details     : 8 signed halfword elements of input vector are added
606  *               together and the resulting integer sum is returned
607  */
func_hadd_sh_s32(v8i16 in)608 static WEBP_INLINE int32_t func_hadd_sh_s32(v8i16 in) {
609   const v4i32 res = __msa_hadd_s_w(in, in);
610   const v2i64 res0 = __msa_hadd_s_d(res, res);
611   const v2i64 res1 = __msa_splati_d(res0, 1);
612   const v2i64 res2 = res0 + res1;
613   const int32_t sum_m = __msa_copy_s_w((v4i32)res2, 0);
614   return sum_m;
615 }
616 #define HADD_SH_S32(in) func_hadd_sh_s32(in)
617 
618 /* Description : Horizontal addition of 8 unsigned halfword elements
619  * Arguments   : Input  - in       (unsigned halfword vector)
620  *               Output - sum_m    (u32 sum)
621  *               Return Type - unsigned word
622  * Details     : 8 unsigned halfword elements of input vector are added
623  *               together and the resulting integer sum is returned
624  */
func_hadd_uh_u32(v8u16 in)625 static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
626   uint32_t sum_m;
627   const v4u32 res_m = __msa_hadd_u_w(in, in);
628   v2u64 res0_m = __msa_hadd_u_d(res_m, res_m);
629   v2u64 res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);
630   res0_m = res0_m + res1_m;
631   sum_m = __msa_copy_s_w((v4i32)res0_m, 0);
632   return sum_m;
633 }
634 #define HADD_UH_U32(in) func_hadd_uh_u32(in)
635 
/* Description : Horizontal addition of signed half word vector elements
 * Arguments   : Inputs  - in0, in1 (HADD_SH4 adds in2, in3)
 *               Outputs - out0, out1 (HADD_SH4 adds out2, out3)
 *               Return Type - as per RTYPE
 * Details     : Each signed odd half word element from 'in0' is added to
 *               even signed half word element from 'in0' (pairwise) and the
 *               word result is written in 'out0'
 *               The remaining outputs are produced the same way from their
 *               matching inputs.
 */
#define HADD_SH2(RTYPE, in0, in1, out0, out1) do {           \
  (out0) = (RTYPE)__msa_hadd_s_w((v8i16)(in0), (v8i16)(in0));  \
  (out1) = (RTYPE)__msa_hadd_s_w((v8i16)(in1), (v8i16)(in1));  \
} while (0)
#define HADD_SH2_SW(...) HADD_SH2(v4i32, __VA_ARGS__)

#define HADD_SH4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) do {  \
  HADD_SH2(RTYPE, in0, in1, out0, out1);                                  \
  HADD_SH2(RTYPE, in2, in3, out2, out3);                                  \
} while (0)
#define HADD_SH4_SW(...) HADD_SH4(v4i32, __VA_ARGS__)
655 
/* Description : Horizontal subtraction of unsigned byte vector elements
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Each unsigned odd byte element from 'in0' is subtracted from
 *               even unsigned byte element from 'in0' (pairwise) and the
 *               halfword result is written to 'out0'
 *               'out1' is produced the same way from 'in1'.
 */
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) do {             \
  (out0) = (RTYPE)__msa_hsub_u_h((v16u8)(in0), (v16u8)(in0));  \
  (out1) = (RTYPE)__msa_hsub_u_h((v16u8)(in1), (v16u8)(in1));  \
} while (0)
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
#define HSUB_UB2_SW(...) HSUB_UB2(v4i32, __VA_ARGS__)
671 
/* Description : Set word elements of a vector to GPR values
 * Arguments   : Inputs - in0, in1 (INSERT_W4: in0, in1, in2, in3)
 *               Output - out (read and updated in place)
 *               Return Type - as per RTYPE
 * Details     : Set word element 0 in vector 'out' to the value specified in
 *               'in0', word element 1 to 'in1' and, for INSERT_W4, word
 *               elements 2 and 3 to 'in2' and 'in3'. Elements that are not
 *               written keep their previous contents.
 */
#define INSERT_W2(RTYPE, in0, in1, out) do {            \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 0, (in0));  \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 1, (in1));  \
} while (0)
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) do {  \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 0, (in0));  \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 1, (in1));  \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 2, (in2));  \
  out = (RTYPE)__msa_insert_w((v4i32)(out), 3, (in3));  \
} while (0)
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
694 
/* Description : Set element n of double word input vector to GPR value
 * Arguments   : Inputs - in0, in1
 *               Output - out (read and updated in place)
 *               Return Type - as per RTYPE
 * Details     : Set element 0 in vector 'out' to GPR value specified in 'in0'
 *               Set element 1 in vector 'out' to GPR value specified in 'in1'
 */
#define INSERT_D2(RTYPE, in0, in1, out) do {            \
  out = (RTYPE)__msa_insert_d((v2i64)(out), 0, (in0));  \
  out = (RTYPE)__msa_insert_d((v2i64)(out), 1, (in1));  \
} while (0)
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
708 
/* Description : Interleave even byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2' and 'in3' are interleaved the
 *               same way into 'out1'.
 */
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_b((v16i8)(in1), (v16i8)(in0));    \
  out1 = (RTYPE)__msa_ilvev_b((v16i8)(in3), (v16i8)(in2));    \
} while (0)
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_UH(...) ILVEV_B2(v8u16, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
725 
/* Description : Interleave odd byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2' and 'in3' are interleaved the
 *               same way into 'out1'.
 */
#define ILVOD_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvod_b((v16i8)(in1), (v16i8)(in0));    \
  out1 = (RTYPE)__msa_ilvod_b((v16i8)(in3), (v16i8)(in2));    \
} while (0)
#define ILVOD_B2_UB(...) ILVOD_B2(v16u8, __VA_ARGS__)
#define ILVOD_B2_SB(...) ILVOD_B2(v16i8, __VA_ARGS__)
#define ILVOD_B2_UH(...) ILVOD_B2(v8u16, __VA_ARGS__)
#define ILVOD_B2_SH(...) ILVOD_B2(v8i16, __VA_ARGS__)
#define ILVOD_B2_SD(...) ILVOD_B2(v2i64, __VA_ARGS__)
742 
/* Description : Interleave even halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2' and 'in3' are interleaved the
 *               same way into 'out1'.
 */
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0));    \
  out1 = (RTYPE)__msa_ilvev_h((v8i16)(in3), (v8i16)(in2));    \
} while (0)
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_UH(...) ILVEV_H2(v8u16, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
758 
/* Description : Interleave odd halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd halfword elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2' and 'in3' are interleaved the
 *               same way into 'out1'.
 */
#define ILVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvod_h((v8i16)(in1), (v8i16)(in0));    \
  out1 = (RTYPE)__msa_ilvod_h((v8i16)(in3), (v8i16)(in2));    \
} while (0)
#define ILVOD_H2_UB(...) ILVOD_H2(v16u8, __VA_ARGS__)
#define ILVOD_H2_UH(...) ILVOD_H2(v8u16, __VA_ARGS__)
#define ILVOD_H2_SH(...) ILVOD_H2(v8i16, __VA_ARGS__)
#define ILVOD_H2_SW(...) ILVOD_H2(v4i32, __VA_ARGS__)
774 
/* Description : Interleave even word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2' and 'in3' are interleaved the
 *               same way into 'out1'.
 */
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0));    \
  out1 = (RTYPE)__msa_ilvev_w((v4i32)(in3), (v4i32)(in2));    \
} while (0)
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
790 
/* Description : Interleave even-odd word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 *               Odd word elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'
 */
#define ILVEVOD_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_w((v4i32)(in1), (v4i32)(in0));      \
  out1 = (RTYPE)__msa_ilvod_w((v4i32)(in3), (v4i32)(in2));      \
} while (0)
#define ILVEVOD_W2_UB(...) ILVEVOD_W2(v16u8, __VA_ARGS__)
#define ILVEVOD_W2_UH(...) ILVEVOD_W2(v8u16, __VA_ARGS__)
#define ILVEVOD_W2_SH(...) ILVEVOD_W2(v8i16, __VA_ARGS__)
#define ILVEVOD_W2_SW(...) ILVEVOD_W2(v4i32, __VA_ARGS__)
808 
/* Description : Interleave even-odd half-word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even half-word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'
 *               Odd half-word elements of 'in2' and 'in3' are interleaved
 *               and written to 'out1'
 */
#define ILVEVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_h((v8i16)(in1), (v8i16)(in0));      \
  out1 = (RTYPE)__msa_ilvod_h((v8i16)(in3), (v8i16)(in2));      \
} while (0)
#define ILVEVOD_H2_UB(...) ILVEVOD_H2(v16u8, __VA_ARGS__)
#define ILVEVOD_H2_UH(...) ILVEVOD_H2(v8u16, __VA_ARGS__)
#define ILVEVOD_H2_SH(...) ILVEVOD_H2(v8i16, __VA_ARGS__)
#define ILVEVOD_H2_SW(...) ILVEVOD_H2(v4i32, __VA_ARGS__)
826 
/* Description : Interleave even double word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even double word elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2' and 'in3' are interleaved the
 *               same way into 'out1'.
 */
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvev_d((v2i64)(in1), (v2i64)(in0));    \
  out1 = (RTYPE)__msa_ilvev_d((v2i64)(in3), (v2i64)(in2));    \
} while (0)
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
#define ILVEV_D2_SD(...) ILVEV_D2(v2i64, __VA_ARGS__)
842 
/* Description : Interleave left half of byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
 *               and written to 'out0'; 'in2' and 'in3' are interleaved the
 *               same way into 'out1'.
 */
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1));    \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)(in2), (v16i8)(in3));    \
} while (0)
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
#define ILVL_B2_SW(...) ILVL_B2(v4i32, __VA_ARGS__)
859 
/* Description : Interleave right half of byte elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3 (ILVR_B4: in0 .. in7)
 *               Outputs - out0, out1 (ILVR_B4: out0 .. out3)
 *               Return Type - as per RTYPE
 * Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
 *               and written to out0. Each following input pair is interleaved
 *               the same way into the next output.
 */
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1));    \
  out1 = (RTYPE)__msa_ilvr_b((v16i8)(in2), (v16i8)(in3));    \
} while (0)
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) do {                    \
  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
} while (0)
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
887 
/* Description : Interleave right half of halfword elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3 (ILVR_H4: in0 .. in7)
 *               Outputs - out0, out1 (ILVR_H4: out0 .. out3)
 *               Return Type - as per RTYPE
 * Details     : Right half of halfword elements of 'in0' and 'in1' are
 *               interleaved and written to 'out0'. Each following input pair
 *               is interleaved the same way into the next output.
 */
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1));    \
  out1 = (RTYPE)__msa_ilvr_h((v8i16)(in2), (v8i16)(in3));    \
} while (0)
#define ILVR_H2_UB(...) ILVR_H2(v16u8, __VA_ARGS__)
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) do {                    \
  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
} while (0)
#define ILVR_H4_UB(...) ILVR_H4(v16u8, __VA_ARGS__)
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
911 
/* Description : Interleave right half of double word elements from vectors
 * Arguments   : Inputs  - in0, in1, in2, in3 (ILVR_D4: in0 .. in7)
 *               Outputs - out0, out1 (ILVR_D4: out0 .. out3)
 *               Return Type - as per RTYPE
 * Details     : Right half of double word elements of 'in0' and 'in1' are
 *               interleaved and written to 'out0'. Each following input pair
 *               is interleaved the same way into the next output.
 */
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));    \
  out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));    \
} while (0)
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3) do {                    \
  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);               \
} while (0)
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
934 
/* Description : Interleave both left and right half of input vectors
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Right half of the elements from 'in0' and 'in1' are
 *               interleaved and written to 'out0'; the left half of the
 *               same two inputs is interleaved and written to 'out1'.
 *               ILVRL_B2 works on bytes, ILVRL_H2 on halfwords and
 *               ILVRL_W2 on words.
 */
#define ILVRL_B2(RTYPE, in0, in1, out0, out1) do {         \
  out0 = (RTYPE)__msa_ilvr_b((v16i8)(in0), (v16i8)(in1));  \
  out1 = (RTYPE)__msa_ilvl_b((v16i8)(in0), (v16i8)(in1));  \
} while (0)
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1) do {         \
  out0 = (RTYPE)__msa_ilvr_h((v8i16)(in0), (v8i16)(in1));  \
  out1 = (RTYPE)__msa_ilvl_h((v8i16)(in0), (v8i16)(in1));  \
} while (0)
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, out0, out1) do {         \
  out0 = (RTYPE)__msa_ilvr_w((v4i32)(in0), (v4i32)(in1));  \
  out1 = (RTYPE)__msa_ilvl_w((v4i32)(in0), (v4i32)(in1));  \
} while (0)
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
#define ILVRL_W2_UW(...) ILVRL_W2(v4u32, __VA_ARGS__)
970 
/* Description : Pack even byte elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3 (PCKEV_B4: in0 .. in7)
 *               Outputs - out0, out1 (PCKEV_B4: out0 .. out3)
 *               Return Type - as per RTYPE
 * Details     : Even byte elements of 'in0' are copied to the left half of
 *               'out0' & even byte elements of 'in1' are copied to the right
 *               half of 'out0'. Each following input pair is packed the same
 *               way into the next output.
 */
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_pckev_b((v16i8)(in0), (v16i8)(in1));    \
  out1 = (RTYPE)__msa_pckev_b((v16i8)(in2), (v16i8)(in3));    \
} while (0)
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3) do {                    \
  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
} while (0)
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
997 
/* Description : Pack even halfword elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even halfword elements of 'in0' are copied to the left half of
 *               'out0' & even halfword elements of 'in1' are copied to the
 *               right half of 'out0'. 'in2' and 'in3' are packed the same way
 *               into 'out1'.
 */
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_pckev_h((v8i16)(in0), (v8i16)(in1));    \
  out1 = (RTYPE)__msa_pckev_h((v8i16)(in2), (v8i16)(in3));    \
} while (0)
#define PCKEV_H2_UH(...) PCKEV_H2(v8u16, __VA_ARGS__)
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
#define PCKEV_H2_UW(...) PCKEV_H2(v4u32, __VA_ARGS__)
1014 
/* Description : Pack even word elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Even word elements of 'in0' are copied to the left half of
 *               'out0' & even word elements of 'in1' are copied to the
 *               right half of 'out0'. 'in2' and 'in3' are packed the same way
 *               into 'out1'.
 */
#define PCKEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_pckev_w((v4i32)(in0), (v4i32)(in1));    \
  out1 = (RTYPE)__msa_pckev_w((v4i32)(in2), (v4i32)(in3));    \
} while (0)
#define PCKEV_W2_UH(...) PCKEV_W2(v8u16, __VA_ARGS__)
#define PCKEV_W2_SH(...) PCKEV_W2(v8i16, __VA_ARGS__)
#define PCKEV_W2_SW(...) PCKEV_W2(v4i32, __VA_ARGS__)
#define PCKEV_W2_UW(...) PCKEV_W2(v4u32, __VA_ARGS__)
1031 
/* Description : Pack odd halfword elements of vector pairs
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Odd halfword elements of 'in0' are copied to the left half of
 *               'out0' & odd halfword elements of 'in1' are copied to the
 *               right half of 'out0'. 'in2' and 'in3' are packed the same way
 *               into 'out1'.
 */
#define PCKOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)__msa_pckod_h((v8i16)(in0), (v8i16)(in1));    \
  out1 = (RTYPE)__msa_pckod_h((v8i16)(in2), (v8i16)(in3));    \
} while (0)
#define PCKOD_H2_UH(...) PCKOD_H2(v8u16, __VA_ARGS__)
#define PCKOD_H2_SH(...) PCKOD_H2(v8i16, __VA_ARGS__)
#define PCKOD_H2_SW(...) PCKOD_H2(v4i32, __VA_ARGS__)
#define PCKOD_H2_UW(...) PCKOD_H2(v4u32, __VA_ARGS__)
1048 
/* Description : Arithmetic immediate shift right all elements of word vector
 * Arguments   : Inputs  - in0, in1 (SRAI_W4: in0 .. in3), shift_val
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of every input vector is right shifted by
 *               'shift_val' through SRAI_W and the result is written back to
 *               the same vector. 'shift_val' is forwarded unchanged to
 *               SRAI_W, which is the __msa_srai_w intrinsic on clang builds
 *               and a plain '>>' otherwise.
 */
#define SRAI_W2(RTYPE, in0, in1, shift_val) do {  \
  in0 = (RTYPE)SRAI_W((in0), (shift_val));        \
  in1 = (RTYPE)SRAI_W((in1), (shift_val));        \
} while (0)
#define SRAI_W2_SW(...) SRAI_W2(v4i32, __VA_ARGS__)
#define SRAI_W2_UW(...) SRAI_W2(v4u32, __VA_ARGS__)

#define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) do {  \
  SRAI_W2(RTYPE, in0, in1, shift_val);                      \
  SRAI_W2(RTYPE, in2, in3, shift_val);                      \
} while (0)
#define SRAI_W4_SW(...) SRAI_W4(v4i32, __VA_ARGS__)
#define SRAI_W4_UW(...) SRAI_W4(v4u32, __VA_ARGS__)
1069 
/* Description : Arithmetic shift right all elements of half-word vector
 * Arguments   : Inputs  - in0, in1, shift_val
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of 'in0' and 'in1' is right shifted by
 *               'shift_val' through SRAI_H and the result is written back to
 *               the same vector. 'shift_val' is forwarded unchanged to
 *               SRAI_H, which is the __msa_srai_h intrinsic on clang builds
 *               and a plain '>>' otherwise.
 */
#define SRAI_H2(RTYPE, in0, in1, shift_val) do {  \
  in0 = (RTYPE)SRAI_H((in0), (shift_val));        \
  in1 = (RTYPE)SRAI_H((in1), (shift_val));        \
} while (0)
#define SRAI_H2_SH(...) SRAI_H2(v8i16, __VA_ARGS__)
#define SRAI_H2_UH(...) SRAI_H2(v8u16, __VA_ARGS__)
1083 
/* Description : Arithmetic rounded shift right all elements of word vector
 * Arguments   : Inputs  - in0, in1 (SRARI_W4: in0 .. in3), shift
 *               Outputs - in place operation
 *               Return Type - as per input vector RTYPE
 * Details     : Each element of every input vector is right shifted by
 *               'shift' with __msa_srari_w and the result is written back to
 *               the same vector. 'shift' is handed straight to the intrinsic
 *               (presumably it must be a compile-time immediate - confirm
 *               against the intrinsic's documentation).
 */
#define SRARI_W2(RTYPE, in0, in1, shift) do {         \
  in0 = (RTYPE)__msa_srari_w((v4i32)(in0), (shift));  \
  in1 = (RTYPE)__msa_srari_w((v4i32)(in1), (shift));  \
} while (0)
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) do {  \
  SRARI_W2(RTYPE, in0, in1, shift);                      \
  SRARI_W2(RTYPE, in2, in3, shift);                      \
} while (0)
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_UW(...) SRARI_W4(v4u32, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1104 
/* Description : Shift right arithmetic rounded double words
 * Arguments   : Inputs  - in0, in1 (SRAR_D4: in0 .. in3), shift
 *               Outputs - in place operation
 *               Return Type - as per RTYPE
 * Details     : Each element of vector 'in0' is shifted right arithmetically by
 *               the number of bits in the corresponding element in the vector
 *               'shift'. The last discarded bit is added to shifted value for
 *               rounding and the result is written in-place. The remaining
 *               inputs are processed the same way.
 *               'shift' is a vector.
 */
#define SRAR_D2(RTYPE, in0, in1, shift) do {                \
  in0 = (RTYPE)__msa_srar_d((v2i64)(in0), (v2i64)(shift));  \
  in1 = (RTYPE)__msa_srar_d((v2i64)(in1), (v2i64)(shift));  \
} while (0)
#define SRAR_D2_SW(...) SRAR_D2(v4i32, __VA_ARGS__)
#define SRAR_D2_SD(...) SRAR_D2(v2i64, __VA_ARGS__)
#define SRAR_D2_UD(...) SRAR_D2(v2u64, __VA_ARGS__)

#define SRAR_D4(RTYPE, in0, in1, in2, in3, shift) do {  \
  SRAR_D2(RTYPE, in0, in1, shift);                      \
  SRAR_D2(RTYPE, in2, in3, shift);                      \
} while (0)
#define SRAR_D4_SD(...) SRAR_D4(v2i64, __VA_ARGS__)
#define SRAR_D4_UD(...) SRAR_D4(v2u64, __VA_ARGS__)
1129 
/* Description : Addition of 2 pairs of half-word operands
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : 'in1' is added to each element of 'in0' through ADDVI_H and
 *               the result is written to 'out0'; 'in3' is added to 'in2' the
 *               same way into 'out1'. ADDVI_H is the __msa_addvi_h intrinsic
 *               on clang builds and a plain '+' otherwise.
 */
#define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)ADDVI_H((in0), (in1));                        \
  out1 = (RTYPE)ADDVI_H((in2), (in3));                        \
} while (0)
#define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__)
#define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__)
1142 
/* Description : Addition of 2 pairs of word operands
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : 'in1' is added to each element of 'in0' through ADDVI_W and
 *               the result is written to 'out0'; 'in3' is added to 'in2' the
 *               same way into 'out1'. ADDVI_W is the __msa_addvi_w intrinsic
 *               on clang builds and a plain '+' otherwise.
 */
#define ADDVI_W2(RTYPE, in0, in1, in2, in3, out0, out1) do {  \
  out0 = (RTYPE)ADDVI_W((in0), (in1));                        \
  out1 = (RTYPE)ADDVI_W((in2), (in3));                        \
} while (0)
#define ADDVI_W2_SW(...) ADDVI_W2(v4i32, __VA_ARGS__)
1154 
/* Description : Fill two word vectors from two GP registers
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : GP register in0 is replicated in each word element of out0
 *               GP register in1 is replicated in each word element of out1
 */
#define FILL_W2(RTYPE, in0, in1, out0, out1) \
  do {                                       \
    out0 = (RTYPE)__msa_fill_w((in0));       \
    out1 = (RTYPE)__msa_fill_w((in1));       \
  } while (0)
#define FILL_W2_SW(...) FILL_W2(v4i32, __VA_ARGS__)
1166 
/* Description : Addition of 2 (ADD2) or 4 (ADD4) pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3 (ADD4: in0 .. in7)
 *               Outputs - out0, out1 (ADD4: out0 .. out3)
 * Details     : Each element in 'in0' is added to 'in1' and result is written
 *               to 'out0'. Each following input pair is summed into the next
 *               output. Operands are parenthesized so that expression
 *               arguments keep their own precedence.
 */
#define ADD2(in0, in1, in2, in3, out0, out1) do {  \
  out0 = (in0) + (in1);                            \
  out1 = (in2) + (in3);                            \
} while (0)

#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) do {             \
  ADD2(in0, in1, in2, in3, out0, out1);               \
  ADD2(in4, in5, in6, in7, out2, out3);               \
} while (0)
1183 
/* Description : Subtraction of 2 (SUB2), 3 (SUB3) or 4 (SUB4) pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3 (SUB3: .. in5, SUB4: .. in7)
 *               Outputs - out0, out1 (SUB3: .. out2, SUB4: .. out3)
 * Details     : Each element in 'in1' is subtracted from 'in0' and result is
 *               written to 'out0'. Each following input pair is handled the
 *               same way into the next output. Operands are parenthesized so
 *               that an argument such as 'b + c' is subtracted as a whole.
 */
#define SUB2(in0, in1, in2, in3, out0, out1) do {  \
  out0 = (in0) - (in1);                            \
  out1 = (in2) - (in3);                            \
} while (0)

#define SUB3(in0, in1, in2, in3, in4, in5, out0, out1, out2) do {  \
  out0 = (in0) - (in1);                                            \
  out1 = (in2) - (in3);                                            \
  out2 = (in4) - (in5);                                            \
} while (0)

#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) do {             \
  out0 = (in0) - (in1);                               \
  out1 = (in2) - (in3);                               \
  out2 = (in4) - (in5);                               \
  out3 = (in6) - (in7);                               \
} while (0)
1208 
/* Description : Addition - Subtraction of input vectors
 * Arguments   : Inputs  - in0, in1
 *               Outputs - out0, out1
 * Details     : Each element in 'in1' is added to 'in0' and result is
 *               written to 'out0'.
 *               Each element in 'in1' is subtracted from 'in0' and result is
 *               written to 'out1'.
 *               Both inputs are evaluated twice, and 'out0' is assigned
 *               before 'out1' is computed, so 'out0' must not alias an input.
 */
#define ADDSUB2(in0, in1, out0, out1) do {  \
  out0 = (in0) + (in1);                     \
  out1 = (in0) - (in1);                     \
} while (0)
1221 
/* Description : Multiplication of 2 (MUL2) or 4 (MUL4) pairs of vectors
 * Arguments   : Inputs  - in0, in1, in2, in3 (MUL4: in0 .. in7)
 *               Outputs - out0, out1 (MUL4: out0 .. out3)
 * Details     : Each element from 'in0' is multiplied with elements from 'in1'
 *               and the result is written to 'out0'. Each following input
 *               pair is multiplied into the next output. Operands are
 *               parenthesized so that an argument such as 'a + b' is
 *               multiplied as a whole.
 */
#define MUL2(in0, in1, in2, in3, out0, out1) do {  \
  out0 = (in0) * (in1);                            \
  out1 = (in2) * (in3);                            \
} while (0)

#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3) do {             \
  MUL2(in0, in1, in2, in3, out0, out1);               \
  MUL2(in4, in5, in6, in7, out2, out3);               \
} while (0)
1238 
/* Description : Sign extend halfword elements from right half of the vector
 * Arguments   : Input  - in    (halfword vector)
 *               Output - out   (sign extended word vector)
 *               Return Type - signed word
 * Details     : Sign bit of halfword elements from input vector 'in' is
 *               extracted (via a signed "less than 0" compare) and
 *               interleaved with the same vector 'in' to generate
 *               4 word elements keeping sign intact
 */
#define UNPCK_R_SH_SW(in, out) do {                     \
  const v8i16 sign_m = __msa_clti_s_h((v8i16)(in), 0);  \
  out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)(in));       \
} while (0)
1251 
/* Description : Sign extend halfword elements from input vector and return
 *               the result in pair of vectors
 * Arguments   : Input   - in            (halfword vector)
 *               Outputs - out0, out1   (sign extended word vectors)
 *               Return Type - signed word
 * Details     : Sign bit of halfword elements from input vector 'in' is
 *               extracted and interleaved right with the same vector 'in' to
 *               generate 4 signed word elements in 'out0'
 *               Then interleaved left with the same vector 'in' to
 *               generate 4 signed word elements in 'out1'
 */
#define UNPCK_SH_SW(in, out0, out1) do {                \
  const v8i16 tmp_m = __msa_clti_s_h((v8i16)(in), 0);   \
  ILVRL_H2_SW(tmp_m, (in), out0, out1);                 \
} while (0)
1267 
/* Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     : Butterfly operation:
 *                 out0 = in0 + in3, out1 = in1 + in2,
 *                 out2 = in1 - in2, out3 = in0 - in3.
 *               Outputs are assigned in that order while the inputs are
 *               re-read, so an output must not alias an input that a later
 *               statement still needs. Operands are parenthesized so that
 *               expression arguments are added/subtracted as a whole.
 */
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) do {  \
  out0 = (in0) + (in3);                                               \
  out1 = (in1) + (in2);                                               \
  out2 = (in1) - (in2);                                               \
  out3 = (in0) - (in3);                                               \
} while (0)
1279 
/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
 *                         in8, in9, in10, in11, in12, in13, in14, in15
 *               Outputs - out0, out1, out2, out3
 *               Return Type - unsigned byte
 * Details     : The sixteen inputs are merged with even-word and
 *               even-doubleword interleaves, then split into the four
 *               outputs with byte and halfword interleaves. 'out1' and
 *               'out3' double as scratch storage before they receive their
 *               final contents, so they must not alias any input.
 */
#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3) do {                   \
  v2i64 tmp0_m, tmp1_m;                                                    \
  v2i64 tmp2_m, tmp3_m;                                                    \
  v2i64 tmp4_m, tmp5_m;                                                    \
  /* Merge inputs 0,1,4,5,8,9,12,13; park the result in out1/out3. */     \
  ILVEV_W2_SD(in0, in4, in8, in12, tmp2_m, tmp3_m);                        \
  ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                        \
  ILVEV_D2_UB(tmp2_m, tmp3_m, tmp0_m, tmp1_m, out1, out3);                 \
  /* Merge inputs 2,3,6,7,10,11,14,15 into tmp2_m/tmp3_m. */              \
  ILVEV_W2_SD(in2, in6, in10, in14, tmp4_m, tmp5_m);                       \
  ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                       \
  ILVEV_D2_SD(tmp4_m, tmp5_m, tmp0_m, tmp1_m, tmp2_m, tmp3_m);             \
  /* Even bytes of the two merged halves produce out0 and out2. */         \
  ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);                 \
  ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out0, out2);               \
  /* Odd bytes produce the final out1 and out3 (scratch is consumed). */   \
  ILVOD_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);                 \
  ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out1, out3);               \
} while (0)
1301 
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
 *                         in8, in9, in10, in11, in12, in13, in14, in15
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 *               Return Type - unsigned byte
 * Details     : Implemented purely with the even/odd interleave helpers.
 *               All eight outputs are used as scratch storage during the
 *               first steps (the sixteen inputs are first merged pairwise
 *               into out7..out0) and are overwritten with their final values
 *               later on. The outputs must therefore be eight distinct
 *               variables, and must not be the same objects as inputs that
 *               are still to be read.
 */
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3, out4, out5,            \
                            out6, out7) do {                               \
  v8i16 even0_m, even1_m;                                                  \
  v8i16 odd0_m, odd1_m, odd2_m, odd3_m;                                    \
  v4i32 word0_m, word1_m;                                                  \
  /* merge inN with in(N+8); results parked in out7 .. out0 */             \
  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
  /* byte interleaves; out5 and out7 are reused as scratch here */         \
  ILVEV_B2_SH(out7, out6, out5, out4, even0_m, even1_m);                   \
  ILVOD_B2_SH(out7, out6, out5, out4, odd0_m, odd1_m);                     \
  ILVEV_B2_UB(out3, out2, out1, out0, out5, out7);                         \
  ILVOD_B2_SH(out3, out2, out1, out0, odd2_m, odd3_m);                     \
  /* even-byte data -> out0, out4, out2, out6 */                           \
  ILVEV_H2_SW(even0_m, even1_m, out5, out7, word0_m, word1_m);             \
  ILVEVOD_W2_UB(word0_m, word1_m, word0_m, word1_m, out0, out4);           \
  ILVOD_H2_SW(even0_m, even1_m, out5, out7, word0_m, word1_m);             \
  ILVEVOD_W2_UB(word0_m, word1_m, word0_m, word1_m, out2, out6);           \
  /* odd-byte data -> out1, out5, out3, out7 */                            \
  ILVEV_H2_SW(odd0_m, odd1_m, odd2_m, odd3_m, word0_m, word1_m);           \
  ILVEVOD_W2_UB(word0_m, word1_m, word0_m, word1_m, out1, out5);           \
  ILVOD_H2_SW(odd0_m, odd1_m, odd2_m, odd3_m, word0_m, word1_m);           \
  ILVEVOD_W2_UB(word0_m, word1_m, word0_m, word1_m, out3, out7);           \
} while (0)
1331 
/* Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 *               Return Type - as per RTYPE
 * Details     : in0/in1 and in2/in3 are first word-interleaved (right and
 *               left halves) through ILVRL_W2_SW; the four outputs are then
 *               formed by right/left doubleword interleaves of those
 *               intermediate vectors. The inputs are only referenced by the
 *               two ILVRL_W2_SW steps, which come before any output is
 *               assigned.
 */
#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3,                            \
                       out0, out1, out2, out3) do {                          \
  v4i32 r01_m, l01_m;  /* right / left word interleave of in0 and in1 */     \
  v4i32 r23_m, l23_m;  /* right / left word interleave of in2 and in3 */     \
  ILVRL_W2_SW(in1, in0, r01_m, l01_m);                                       \
  ILVRL_W2_SW(in3, in2, r23_m, l23_m);                                       \
  out0 = (RTYPE)__msa_ilvr_d((v2i64)r23_m, (v2i64)r01_m);                    \
  out1 = (RTYPE)__msa_ilvl_d((v2i64)r23_m, (v2i64)r01_m);                    \
  out2 = (RTYPE)__msa_ilvr_d((v2i64)l23_m, (v2i64)l01_m);                    \
  out3 = (RTYPE)__msa_ilvl_d((v2i64)l23_m, (v2i64)l01_m);                    \
} while (0)
#define TRANSPOSE4x4_SW_SW(...) TRANSPOSE4x4_W(v4i32, __VA_ARGS__)
1348 
/* Description : Add block 4x4
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : Four words are loaded from 'pdst' (LW4, using 'stride') and
 *               inserted pairwise into two byte vectors, which are then
 *               interleaved with zero to form halfword vectors. The right
 *               doublewords of in0/in1 and of in2/in3 (combined by
 *               ILVR_D2_SH) are added to them, the sums are clipped between
 *               0-255, packed back to bytes and stored to 'pdst' as a 4x4
 *               block (ST4x4_UB).
 *               'pdst' and 'stride' are expanded more than once, so they
 *               must be free of side effects.
 */
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) do {            \
  uint32_t pix0_m;                                                        \
  uint32_t pix1_m;                                                        \
  uint32_t pix2_m;                                                        \
  uint32_t pix3_m;                                                        \
  v8i16 resid01_m, resid23_m;  /* values to add, two inputs per vector */ \
  v8i16 sum01_m, sum23_m;      /* destination widened, then the sums */   \
  v16i8 rows01_m = { 0 };                                                 \
  v16i8 rows23_m = { 0 };                                                 \
  const v16i8 zeros_m = { 0 };                                            \
  ILVR_D2_SH(in1, in0, in3, in2, resid01_m, resid23_m);                   \
  LW4(pdst, stride, pix0_m, pix1_m, pix2_m, pix3_m);                      \
  INSERT_W2_SB(pix0_m, pix1_m, rows01_m);                                 \
  INSERT_W2_SB(pix2_m, pix3_m, rows23_m);                                 \
  ILVR_B2_SH(zeros_m, rows01_m, zeros_m, rows23_m, sum01_m, sum23_m);     \
  ADD2(sum01_m, resid01_m, sum23_m, resid23_m, sum01_m, sum23_m);         \
  CLIP_SH2_0_255(sum01_m, sum23_m);                                       \
  PCKEV_B2_SB(sum01_m, sum01_m, sum23_m, sum23_m, rows01_m, rows23_m);    \
  ST4x4_UB(rows01_m, rows23_m, 0, 1, 0, 1, pdst, stride);                 \
} while (0)
1370 
/* Description : Pack even byte elements, then store a 4x4 block
 * Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
 * Details     : The even bytes of the (in1, in0) pair and of the (in3, in2)
 *               pair are packed into two byte vectors. The words at index 0
 *               and 2 of each packed vector are then written to 'pdst' as
 *               per 'stride' (four words in total).
 */
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) do {  \
  v16i8 packed01_m;                                            \
  v16i8 packed23_m;                                            \
  PCKEV_B2_SB(in1, in0, in3, in2, packed01_m, packed23_m);     \
  ST4x4_UB(packed01_m, packed23_m, 0, 2, 0, 2, pdst, stride);  \
} while (0)
1381 
/* Description : average with rounding (in0 + in1 + 1) / 2.
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1
 *               Return Type - as per RTYPE
 * Details     : Each unsigned byte element from 'in0' vector is added with
 *               each unsigned byte element from 'in1' vector. Then the average
 *               with rounding is calculated and written to 'out0'.
 *               'in2' and 'in3' are averaged the same way into 'out1'.
 *               Each input is parenthesized before being cast to v16u8 so
 *               that a compound argument expression is converted as a whole
 *               instead of only its first operand.
 */
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) do {   \
  out0 = (RTYPE)__msa_aver_u_b((v16u8)(in0), (v16u8)(in1));    \
  out1 = (RTYPE)__msa_aver_u_b((v16u8)(in2), (v16u8)(in3));    \
} while (0)
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
1395 
1396 #endif  // WEBP_USE_MSA
1397 #endif  // WEBP_DSP_MSA_MACRO_H_
1398