• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Loongson Technology Corporation Limited
3  * All rights reserved.
4  * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
5  *                Xiwei Gu   <guxiwei-hf@loongson.cn>
6  *                Lu Wang    <wanglu@loongson.cn>
7  *
8  * This file is a header file for the LoongArch builtin extension.
9  *
10  */
11 
12 #ifndef LOONGSON_INTRINSICS_H
13 #define LOONGSON_INTRINSICS_H
14 
15 /**
16  * MAJOR version: Macro usage changes.
17  * MINOR version: Add new functions, or bug fix.
18  * MICRO version: Comment changes or implementation changes.
19  */
20 #define LSOM_VERSION_MAJOR 1
21 #define LSOM_VERSION_MINOR 0
22 #define LSOM_VERSION_MICRO 3
23 
/*
 * Apply the one-operand operation _INS to two independent inputs:
 *   _OUT0 = _INS(_IN0);
 *   _OUT1 = _INS(_IN1);
 * The body is wrapped in do { } while (0) so the expansion is a single
 * statement and stays correct inside an unbraced if/else. Invoke it with a
 * trailing semicolon.
 */
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
do { \
    _OUT0 = _INS(_IN0); \
    _OUT1 = _INS(_IN1); \
} while (0)
29 
/*
 * Apply the two-operand operation _INS to two independent operand pairs:
 *   _OUT0 = _INS(_IN0, _IN1);
 *   _OUT1 = _INS(_IN2, _IN3);
 * The body is wrapped in do { } while (0) so the expansion is a single
 * statement and stays correct inside an unbraced if/else. Invoke it with a
 * trailing semicolon.
 */
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
do { \
    _OUT0 = _INS(_IN0, _IN1); \
    _OUT1 = _INS(_IN2, _IN3); \
} while (0)
35 
/*
 * Apply the three-operand operation _INS to two independent operand triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);
 *   _OUT1 = _INS(_IN3, _IN4, _IN5);
 * The body is wrapped in do { } while (0) so the expansion is a single
 * statement and stays correct inside an unbraced if/else. Invoke it with a
 * trailing semicolon.
 */
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
do { \
    _OUT0 = _INS(_IN0, _IN1, _IN2); \
    _OUT1 = _INS(_IN3, _IN4, _IN5); \
} while (0)
41 
/*
 * Apply the one-operand operation _INS to four independent inputs
 * (_OUTn = _INS(_INn) for n = 0..3) by expanding DUP2_ARG1 twice.
 * The body is wrapped in do { } while (0) so the expansion is a single
 * statement and stays correct inside an unbraced if/else. Invoke it with a
 * trailing semicolon.
 */
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
do { \
    DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1); \
    DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3); \
} while (0)
47 
/*
 * Apply the two-operand operation _INS to four independent operand pairs:
 *   _OUT0 = _INS(_IN0, _IN1);  _OUT1 = _INS(_IN2, _IN3);
 *   _OUT2 = _INS(_IN4, _IN5);  _OUT3 = _INS(_IN6, _IN7);
 * Implemented by expanding DUP2_ARG2 twice. The body is wrapped in
 * do { } while (0) so the expansion is a single statement and stays correct
 * inside an unbraced if/else. Invoke it with a trailing semicolon.
 */
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
                  _OUT0, _OUT1, _OUT2, _OUT3) \
do { \
    DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1); \
    DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3); \
} while (0)
54 
/*
 * Apply the three-operand operation _INS to four independent operand triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);   _OUT1 = _INS(_IN3, _IN4,  _IN5);
 *   _OUT2 = _INS(_IN6, _IN7, _IN8);   _OUT3 = _INS(_IN9, _IN10, _IN11);
 * Implemented by expanding DUP2_ARG3 twice. The body is wrapped in
 * do { } while (0) so the expansion is a single statement and stays correct
 * inside an unbraced if/else. Invoke it with a trailing semicolon.
 */
#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
                  _IN8, _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
do { \
    DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4,  _IN5,  _OUT0, _OUT1); \
    DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3); \
} while (0)
61 
62 #ifdef __loongarch_sx
63 #include <lsxintrin.h>
64 /*
65  * =============================================================================
66  * Description : Dot product & addition of byte vector elements
67  * Arguments   : Inputs  - in_c, in_h, in_l
68  *               Outputs - out
69  *               Retrun Type - halfword
70  * Details     : Signed byte elements from in_h are multiplied by
71  *               signed byte elements from in_l, and then added adjacent to
72  *               each other to get results with the twice size of input.
73  *               Then the results plus to signed half word elements from in_c.
74  * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
75  *        in_c : 1,2,3,4, 1,2,3,4
76  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
77  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
78  *         out : 23,40,41,26, 23,40,41,26
79  * =============================================================================
80  */
__lsx_vdp2add_h_b(__m128i in_c,__m128i in_h,__m128i in_l)81 static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l)
82 {
83     __m128i out;
84 
85     out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
86     out = __lsx_vmaddwod_h_b(out, in_h, in_l);
87     return out;
88 }
89 
90 /*
91  * =============================================================================
92  * Description : Dot product & addition of byte vector elements
93  * Arguments   : Inputs  - in_c, in_h, in_l
94  *               Outputs - out
95  *               Retrun Type - halfword
96  * Details     : Unsigned byte elements from in_h are multiplied by
97  *               unsigned byte elements from in_l, and then added adjacent to
98  *               each other to get results with the twice size of input.
99  *               The results plus to signed half word elements from in_c.
100  * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
101  *        in_c : 1,2,3,4, 1,2,3,4
102  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
103  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
104  *         out : 23,40,41,26, 23,40,41,26
105  * =============================================================================
106  */
__lsx_vdp2add_h_bu(__m128i in_c,__m128i in_h,__m128i in_l)107 static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_l)
108 {
109     __m128i out;
110 
111     out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
112     out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
113     return out;
114 }
115 
116 /*
117  * =============================================================================
118  * Description : Dot product & addition of half word vector elements
119  * Arguments   : Inputs  - in_c, in_h, in_l
120  *               Outputs - out
121  *               Retrun Type - __m128i
122  * Details     : Signed half word elements from in_h are multiplied by
123  *               signed half word elements from in_l, and then added adjacent to
124  *               each other to get results with the twice size of input.
125  *               Then the results plus to signed word elements from in_c.
126  * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
127  *        in_c : 1,2,3,4
128  *        in_h : 1,2,3,4, 5,6,7,8
129  *        in_l : 8,7,6,5, 4,3,2,1
130  *         out : 23,40,41,26
131  * =============================================================================
132  */
__lsx_vdp2add_w_h(__m128i in_c,__m128i in_h,__m128i in_l)133 static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l)
134 {
135     __m128i out;
136 
137     out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
138     out = __lsx_vmaddwod_w_h(out, in_h, in_l);
139     return out;
140 }
141 
142 /*
143  * =============================================================================
144  * Description : Dot product of byte vector elements
145  * Arguments   : Inputs  - in_h, in_l
146  *               Outputs - out
147  *               Retrun Type - halfword
148  * Details     : Signed byte elements from in_h are multiplied by
149  *               signed byte elements from in_l, and then added adjacent to
150  *               each other to get results with the twice size of input.
151  * Example     : out = __lsx_vdp2_h_b(in_h, in_l)
152  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
153  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
154  *         out : 22,38,38,22, 22,38,38,22
155  * =============================================================================
156  */
__lsx_vdp2_h_b(__m128i in_h,__m128i in_l)157 static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
158 {
159     __m128i out;
160 
161     out = __lsx_vmulwev_h_b(in_h, in_l);
162     out = __lsx_vmaddwod_h_b(out, in_h, in_l);
163     return out;
164 }
165 
166 /*
167  * =============================================================================
168  * Description : Dot product of byte vector elements
169  * Arguments   : Inputs  - in_h, in_l
170  *               Outputs - out
171  *               Retrun Type - halfword
172  * Details     : Unsigned byte elements from in_h are multiplied by
173  *               unsigned byte elements from in_l, and then added adjacent to
174  *               each other to get results with the twice size of input.
175  * Example     : out = __lsx_vdp2_h_bu(in_h, in_l)
176  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
177  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
178  *         out : 22,38,38,22, 22,38,38,22
179  * =============================================================================
180  */
__lsx_vdp2_h_bu(__m128i in_h,__m128i in_l)181 static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
182 {
183     __m128i out;
184 
185     out = __lsx_vmulwev_h_bu(in_h, in_l);
186     out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
187     return out;
188 }
189 
190 /*
191  * =============================================================================
192  * Description : Dot product of byte vector elements
193  * Arguments   : Inputs  - in_h, in_l
194  *               Outputs - out
195  *               Retrun Type - halfword
196  * Details     : Unsigned byte elements from in_h are multiplied by
197  *               signed byte elements from in_l, and then added adjacent to
198  *               each other to get results with the twice size of input.
199  * Example     : out = __lsx_vdp2_h_bu_b(in_h, in_l)
200  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
201  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
202  *         out : 22,38,38,22, 22,38,38,6
203  * =============================================================================
204  */
__lsx_vdp2_h_bu_b(__m128i in_h,__m128i in_l)205 static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
206 {
207     __m128i out;
208 
209     out = __lsx_vmulwev_h_bu_b(in_h, in_l);
210     out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
211     return out;
212 }
213 
214 /*
215  * =============================================================================
216  * Description : Dot product of byte vector elements
217  * Arguments   : Inputs  - in_h, in_l
218  *               Outputs - out
219  *               Retrun Type - halfword
220  * Details     : Signed byte elements from in_h are multiplied by
221  *               signed byte elements from in_l, and then added adjacent to
222  *               each other to get results with the twice size of input.
223  * Example     : out = __lsx_vdp2_w_h(in_h, in_l)
224  *        in_h : 1,2,3,4, 5,6,7,8
225  *        in_l : 8,7,6,5, 4,3,2,1
226  *         out : 22,38,38,22
227  * =============================================================================
228  */
__lsx_vdp2_w_h(__m128i in_h,__m128i in_l)229 static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l)
230 {
231     __m128i out;
232 
233     out = __lsx_vmulwev_w_h(in_h, in_l);
234     out = __lsx_vmaddwod_w_h(out, in_h, in_l);
235     return out;
236 }
237 
238 /*
239  * =============================================================================
240  * Description : Clip all halfword elements of input vector between min & max
241  *               out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : (_in))
242  * Arguments   : Inputs  - _in  (input vector)
243  *                       - min  (min threshold)
244  *                       - max  (max threshold)
245  *               Outputs - out  (output vector with clipped elements)
246  *               Return Type - signed halfword
247  * Example     : out = __lsx_vclip_h(_in)
248  *         _in : -8,2,280,249, -8,255,280,249
249  *         min : 1,1,1,1, 1,1,1,1
250  *         max : 9,9,9,9, 9,9,9,9
251  *         out : 1,2,9,9, 1,9,9,9
252  * =============================================================================
253  */
__lsx_vclip_h(__m128i _in,__m128i min,__m128i max)254 static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max)
255 {
256     __m128i out;
257 
258     out = __lsx_vmax_h(min, _in);
259     out = __lsx_vmin_h(max, out);
260     return out;
261 }
262 
263 /*
264  * =============================================================================
265  * Description : Set each element of vector between 0 and 255
266  * Arguments   : Inputs  - _in
267  *               Outputs - out
268  *               Retrun Type - halfword
269  * Details     : Signed byte elements from _in are clamped between 0 and 255.
270  * Example     : out = __lsx_vclip255_h(_in)
271  *         _in : -8,255,280,249, -8,255,280,249
272  *         out : 0,255,255,249, 0,255,255,249
273  * =============================================================================
274  */
__lsx_vclip255_h(__m128i _in)275 static inline __m128i __lsx_vclip255_h(__m128i _in)
276 {
277     __m128i out;
278 
279     out = __lsx_vmaxi_h(_in, 0);
280     out = __lsx_vsat_hu(out, 7);
281     return out;
282 }
283 
284 /*
285  * =============================================================================
286  * Description : Set each element of vector between 0 and 255
287  * Arguments   : Inputs  - _in
288  *               Outputs - out
289  *               Retrun Type - word
290  * Details     : Signed byte elements from _in are clamped between 0 and 255.
291  * Example     : out = __lsx_vclip255_w(_in)
292  *         _in : -8,255,280,249
293  *         out : 0,255,255,249
294  * =============================================================================
295  */
__lsx_vclip255_w(__m128i _in)296 static inline __m128i __lsx_vclip255_w(__m128i _in)
297 {
298     __m128i out;
299 
300     out = __lsx_vmaxi_w(_in, 0);
301     out = __lsx_vsat_wu(out, 7);
302     return out;
303 }
304 
/*
 * =============================================================================
 * Description : Swap two variables
 * Arguments   : Inputs  - _in0, _in1
 *               Outputs - _in0, _in1 (in-place)
 * Details     : Swaps the two input variables with three xor operations, so
 *               no temporary is needed. _in0 and _in1 must be two distinct
 *               variables: passing the same variable for both leaves it zero,
 *               because the first xor clears it.
 *               The body is wrapped in do { } while (0) so the expansion is a
 *               single statement; invoke it with a trailing semicolon.
 * Example     : LSX_SWAP(_in0, _in1)
 *        _in0 : 1,2,3,4
 *        _in1 : 5,6,7,8
 *   _in0(out) : 5,6,7,8
 *   _in1(out) : 1,2,3,4
 * =============================================================================
 */
#define LSX_SWAP(_in0, _in1)                                            \
do {                                                                    \
    _in0 = __lsx_vxor_v(_in0, _in1);                                    \
    _in1 = __lsx_vxor_v(_in0, _in1);                                    \
    _in0 = __lsx_vxor_v(_in0, _in1);                                    \
} while (0)
324 
/*
 * =============================================================================
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Rows are interleaved pairwise at word granularity, then the
 *               intermediate vectors are interleaved at doubleword
 *               granularity. All inputs are read into temporaries before any
 *               output is written.
 *               The body is wrapped in do { } while (0) so the expansion is a
 *               single statement; invoke it with a trailing semicolon.
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 * =============================================================================
 */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
do {                                                                           \
    __m128i _t0, _t1, _t2, _t3;                                                \
                                                                               \
    _t0   = __lsx_vilvl_w(_in1, _in0);                                         \
    _t1   = __lsx_vilvh_w(_in1, _in0);                                         \
    _t2   = __lsx_vilvl_w(_in3, _in2);                                         \
    _t3   = __lsx_vilvh_w(_in3, _in2);                                         \
    _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
    _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
    _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
    _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
} while (0)
351 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 * Details     : The rows of the matrix become columns, and the columns become rows.
 *               Even-numbered outputs are built by interleaving; each
 *               odd-numbered output is then derived from the even-numbered
 *               output before it with a byte shuffle against a zero vector,
 *               so the outputs must be distinct variables.
 *               Internal locals carry a leading underscore so they are less
 *               likely to capture a caller argument of the same name.
 *               The body is wrapped in do { } while (0) so the expansion is a
 *               single statement; invoke it with a trailing semicolon.
 * Example     : LSX_TRANSPOSE8x8_B
 *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 *       _out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
 *       _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
 *       _out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
 *       _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
do {                                                                              \
   __m128i _zero = {0};                                                           \
   __m128i _shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110};                     \
   __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                                \
                                                                                  \
   _t0 = __lsx_vilvl_b(_in2, _in0);                                               \
   _t1 = __lsx_vilvl_b(_in3, _in1);                                               \
   _t2 = __lsx_vilvl_b(_in6, _in4);                                               \
   _t3 = __lsx_vilvl_b(_in7, _in5);                                               \
   _t4 = __lsx_vilvl_b(_t1, _t0);                                                 \
   _t5 = __lsx_vilvh_b(_t1, _t0);                                                 \
   _t6 = __lsx_vilvl_b(_t3, _t2);                                                 \
   _t7 = __lsx_vilvh_b(_t3, _t2);                                                 \
   _out0 = __lsx_vilvl_w(_t6, _t4);                                               \
   _out2 = __lsx_vilvh_w(_t6, _t4);                                               \
   _out4 = __lsx_vilvl_w(_t7, _t5);                                               \
   _out6 = __lsx_vilvh_w(_t7, _t5);                                               \
   _out1 = __lsx_vshuf_b(_zero, _out0, _shuf8);                                   \
   _out3 = __lsx_vshuf_b(_zero, _out2, _shuf8);                                   \
   _out5 = __lsx_vshuf_b(_zero, _out4, _shuf8);                                   \
   _out7 = __lsx_vshuf_b(_zero, _out6, _shuf8);                                   \
} while (0)
402 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with half word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 * Details     : Two rounds of halfword interleaves build eight intermediate
 *               vectors; the outputs are then assembled by picking the even
 *               or odd doublewords of intermediate pairs. All inputs are read
 *               into temporaries before any output is written.
 *               The body is wrapped in do { } while (0) so the expansion is a
 *               single statement; invoke it with a trailing semicolon.
 * Example     :
 *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
 *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
 *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
 *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
 *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
 *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
 *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
 *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
do {                                                                              \
    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                     \
                                                                                  \
    _s0 = __lsx_vilvl_h(_in6, _in4);                                              \
    _s1 = __lsx_vilvl_h(_in7, _in5);                                              \
    _t0 = __lsx_vilvl_h(_s1, _s0);                                                \
    _t1 = __lsx_vilvh_h(_s1, _s0);                                                \
    _s0 = __lsx_vilvh_h(_in6, _in4);                                              \
    _s1 = __lsx_vilvh_h(_in7, _in5);                                              \
    _t2 = __lsx_vilvl_h(_s1, _s0);                                                \
    _t3 = __lsx_vilvh_h(_s1, _s0);                                                \
    _s0 = __lsx_vilvl_h(_in2, _in0);                                              \
    _s1 = __lsx_vilvl_h(_in3, _in1);                                              \
    _t4 = __lsx_vilvl_h(_s1, _s0);                                                \
    _t5 = __lsx_vilvh_h(_s1, _s0);                                                \
    _s0 = __lsx_vilvh_h(_in2, _in0);                                              \
    _s1 = __lsx_vilvh_h(_in3, _in1);                                              \
    _t6 = __lsx_vilvl_h(_s1, _s0);                                                \
    _t7 = __lsx_vilvh_h(_s1, _s0);                                                \
                                                                                  \
    _out0 = __lsx_vpickev_d(_t0, _t4);                                            \
    _out2 = __lsx_vpickev_d(_t1, _t5);                                            \
    _out4 = __lsx_vpickev_d(_t2, _t6);                                            \
    _out6 = __lsx_vpickev_d(_t3, _t7);                                            \
    _out1 = __lsx_vpickod_d(_t0, _t4);                                            \
    _out3 = __lsx_vpickod_d(_t1, _t5);                                            \
    _out5 = __lsx_vpickod_d(_t2, _t6);                                            \
    _out7 = __lsx_vpickod_d(_t3, _t7);                                            \
} while (0)
451 
/*
 * =============================================================================
 * Description : Transpose input 8x4 byte block into 4x8
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x4 byte block, one row per vector)
 *               Outputs - _out0, _out1, _out2, _out3  (output 4x8 byte block)
 * Details     : The rows of the matrix become columns, and the columns become rows.
 *               _out0 and _out2 are produced first; _out1 and _out3 are then
 *               derived from them, so the outputs must be distinct variables.
 *               The body is wrapped in do { } while (0) so the expansion is a
 *               single statement; invoke it with a trailing semicolon.
 * Example     : LSX_TRANSPOSE8x4_B
 *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,       \
                           _out0, _out1, _out2, _out3)                           \
do {                                                                             \
    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                                  \
                                                                                 \
    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                       \
    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                       \
    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                                   \
    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                       \
    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                       \
                                                                                 \
    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                                   \
    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                                   \
    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                                   \
                                                                                 \
    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                                     \
    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                                     \
    _out1 = __lsx_vilvh_d(_out2, _out0);                                         \
    _out3 = __lsx_vilvh_d(_out0, _out2);                                         \
} while (0)
495 
/*
 * =============================================================================
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _in8
 *                         _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 * Details     : Interleaves the sixteen input rows at byte, then word, then
 *               doubleword granularity, using the DUP2_ARG2 / DUP4_ARG2
 *               helpers. All inputs are read into temporaries before any
 *               output is written.
 *               The body is wrapped in do { } while (0) so the expansion is a
 *               single statement; invoke it with a trailing semicolon.
 * Example     :
 *              000,001,002,003,004,005,006,007
 *              008,009,010,011,012,013,014,015
 *              016,017,018,019,020,021,022,023
 *              024,025,026,027,028,029,030,031
 *              032,033,034,035,036,037,038,039
 *              040,041,042,043,044,045,046,047        000,008,...,112,120
 *              048,049,050,051,052,053,054,055        001,009,...,113,121
 *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
 *              064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
 *              072,073,074,075,076,077,078,079        004,012,...,116,124
 *              080,081,082,083,084,085,086,087        005,013,...,117,125
 *              088,089,090,091,092,093,094,095        006,014,...,118,126
 *              096,097,098,099,100,101,102,103        007,015,...,119,127
 *              104,105,106,107,108,109,110,111
 *              112,113,114,115,116,117,118,119
 *              120,121,122,123,124,125,126,127
 * =============================================================================
 */
#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _in8,  \
                            _in9, _in10, _in11, _in12, _in13, _in14, _in15, _out0, \
                            _out1, _out2, _out3, _out4, _out5, _out6, _out7)       \
do {                                                                               \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;                \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                                \
    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5,       \
              _tmp0, _tmp1, _tmp2, _tmp3);                                         \
    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,        \
              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                                  \
    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);                \
    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);                \
    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);                \
    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);                \
    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);                    \
    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);                    \
    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);                    \
    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);                    \
    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);            \
    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);            \
    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);            \
    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);            \
} while (0)
545 
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors (byte elements)
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation. Inputs are re-read after the first
 *               outputs are written, so an output must not be the same
 *               variable as an input that a later line still reads.
 *               The body is wrapped in do { } while (0) so the expansion is a
 *               single statement; invoke it with a trailing semicolon.
 * Example     :
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 * =============================================================================
 */
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
do {                                                                          \
    _out0 = __lsx_vadd_b(_in0, _in3);                                         \
    _out1 = __lsx_vadd_b(_in1, _in2);                                         \
    _out2 = __lsx_vsub_b(_in1, _in2);                                         \
    _out3 = __lsx_vsub_b(_in0, _in3);                                         \
} while (0)
/*
 * Butterfly of 4 input vectors with halfword elements:
 *   _out0 = _in0 + _in3;  _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;  _out3 = _in0 - _in3;
 * Wrapped in do { } while (0) so the expansion is a single statement; invoke
 * it with a trailing semicolon.
 */
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
do {                                                                          \
    _out0 = __lsx_vadd_h(_in0, _in3);                                         \
    _out1 = __lsx_vadd_h(_in1, _in2);                                         \
    _out2 = __lsx_vsub_h(_in1, _in2);                                         \
    _out3 = __lsx_vsub_h(_in0, _in3);                                         \
} while (0)
/*
 * Butterfly of 4 input vectors with word elements:
 *   _out0 = _in0 + _in3;  _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;  _out3 = _in0 - _in3;
 * Wrapped in do { } while (0) so the expansion is a single statement; invoke
 * it with a trailing semicolon.
 */
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
do {                                                                          \
    _out0 = __lsx_vadd_w(_in0, _in3);                                         \
    _out1 = __lsx_vadd_w(_in1, _in2);                                         \
    _out2 = __lsx_vsub_w(_in1, _in2);                                         \
    _out3 = __lsx_vsub_w(_in0, _in3);                                         \
} while (0)
/*
 * Butterfly of 4 input vectors with doubleword elements:
 *   _out0 = _in0 + _in3;  _out1 = _in1 + _in2;
 *   _out2 = _in1 - _in2;  _out3 = _in0 - _in3;
 * Wrapped in do { } while (0) so the expansion is a single statement; invoke
 * it with a trailing semicolon.
 */
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
do {                                                                          \
    _out0 = __lsx_vadd_d(_in0, _in3);                                         \
    _out1 = __lsx_vadd_d(_in1, _in2);                                         \
    _out2 = __lsx_vsub_d(_in1, _in2);                                         \
    _out3 = __lsx_vsub_d(_in0, _in3);                                         \
} while (0)
587 
/*
 * =============================================================================
 * Description : Butterfly of 8 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
 *               Outputs - _out0, _out1, _out2, _out3, ~
 * Details     : Butterfly operation. The _B/_H/_W/_D variants apply it to
 *               byte, halfword, word and doubleword elements respectively.
 *               All inputs are copied into locals before any output is
 *               written, so an output may be the same variable as an input
 *               (in-place use) and every input expression is evaluated
 *               exactly once.
 * Example     :
 *              _out0 = _in0 + _in7;
 *              _out1 = _in1 + _in6;
 *              _out2 = _in2 + _in5;
 *              _out3 = _in3 + _in4;
 *              _out4 = _in3 - _in4;
 *              _out5 = _in2 - _in5;
 *              _out6 = _in1 - _in6;
 *              _out7 = _in0 - _in7;
 * =============================================================================
 */
#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
{                                                                                \
    const __m128i _bfly_i0 = (_in0), _bfly_i1 = (_in1);                          \
    const __m128i _bfly_i2 = (_in2), _bfly_i3 = (_in3);                          \
    const __m128i _bfly_i4 = (_in4), _bfly_i5 = (_in5);                          \
    const __m128i _bfly_i6 = (_in6), _bfly_i7 = (_in7);                          \
    _out0 = __lsx_vadd_b(_bfly_i0, _bfly_i7);                                    \
    _out1 = __lsx_vadd_b(_bfly_i1, _bfly_i6);                                    \
    _out2 = __lsx_vadd_b(_bfly_i2, _bfly_i5);                                    \
    _out3 = __lsx_vadd_b(_bfly_i3, _bfly_i4);                                    \
    _out4 = __lsx_vsub_b(_bfly_i3, _bfly_i4);                                    \
    _out5 = __lsx_vsub_b(_bfly_i2, _bfly_i5);                                    \
    _out6 = __lsx_vsub_b(_bfly_i1, _bfly_i6);                                    \
    _out7 = __lsx_vsub_b(_bfly_i0, _bfly_i7);                                    \
}
617 
/*
 * Butterfly of 8 vectors, halfword elements:
 *   _outN     = _inN + _in(7-N)   for N = 0..3
 *   _out(7-N) = _inN - _in(7-N)   for N = 0..3
 * All inputs are copied into locals before any output is written, so an
 * output may be the same variable as an input (in-place use) and every input
 * expression is evaluated exactly once.
 */
#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
{                                                                                \
    const __m128i _bfly_i0 = (_in0), _bfly_i1 = (_in1);                          \
    const __m128i _bfly_i2 = (_in2), _bfly_i3 = (_in3);                          \
    const __m128i _bfly_i4 = (_in4), _bfly_i5 = (_in5);                          \
    const __m128i _bfly_i6 = (_in6), _bfly_i7 = (_in7);                          \
    _out0 = __lsx_vadd_h(_bfly_i0, _bfly_i7);                                    \
    _out1 = __lsx_vadd_h(_bfly_i1, _bfly_i6);                                    \
    _out2 = __lsx_vadd_h(_bfly_i2, _bfly_i5);                                    \
    _out3 = __lsx_vadd_h(_bfly_i3, _bfly_i4);                                    \
    _out4 = __lsx_vsub_h(_bfly_i3, _bfly_i4);                                    \
    _out5 = __lsx_vsub_h(_bfly_i2, _bfly_i5);                                    \
    _out6 = __lsx_vsub_h(_bfly_i1, _bfly_i6);                                    \
    _out7 = __lsx_vsub_h(_bfly_i0, _bfly_i7);                                    \
}
630 
/*
 * Butterfly of 8 vectors, word elements:
 *   _outN     = _inN + _in(7-N)   for N = 0..3
 *   _out(7-N) = _inN - _in(7-N)   for N = 0..3
 * All inputs are copied into locals before any output is written, so an
 * output may be the same variable as an input (in-place use) and every input
 * expression is evaluated exactly once.
 */
#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
{                                                                                \
    const __m128i _bfly_i0 = (_in0), _bfly_i1 = (_in1);                          \
    const __m128i _bfly_i2 = (_in2), _bfly_i3 = (_in3);                          \
    const __m128i _bfly_i4 = (_in4), _bfly_i5 = (_in5);                          \
    const __m128i _bfly_i6 = (_in6), _bfly_i7 = (_in7);                          \
    _out0 = __lsx_vadd_w(_bfly_i0, _bfly_i7);                                    \
    _out1 = __lsx_vadd_w(_bfly_i1, _bfly_i6);                                    \
    _out2 = __lsx_vadd_w(_bfly_i2, _bfly_i5);                                    \
    _out3 = __lsx_vadd_w(_bfly_i3, _bfly_i4);                                    \
    _out4 = __lsx_vsub_w(_bfly_i3, _bfly_i4);                                    \
    _out5 = __lsx_vsub_w(_bfly_i2, _bfly_i5);                                    \
    _out6 = __lsx_vsub_w(_bfly_i1, _bfly_i6);                                    \
    _out7 = __lsx_vsub_w(_bfly_i0, _bfly_i7);                                    \
}
643 
/*
 * Butterfly of 8 vectors, doubleword elements:
 *   _outN     = _inN + _in(7-N)   for N = 0..3
 *   _out(7-N) = _inN - _in(7-N)   for N = 0..3
 * All inputs are copied into locals before any output is written, so an
 * output may be the same variable as an input (in-place use) and every input
 * expression is evaluated exactly once.
 */
#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
{                                                                                \
    const __m128i _bfly_i0 = (_in0), _bfly_i1 = (_in1);                          \
    const __m128i _bfly_i2 = (_in2), _bfly_i3 = (_in3);                          \
    const __m128i _bfly_i4 = (_in4), _bfly_i5 = (_in5);                          \
    const __m128i _bfly_i6 = (_in6), _bfly_i7 = (_in7);                          \
    _out0 = __lsx_vadd_d(_bfly_i0, _bfly_i7);                                    \
    _out1 = __lsx_vadd_d(_bfly_i1, _bfly_i6);                                    \
    _out2 = __lsx_vadd_d(_bfly_i2, _bfly_i5);                                    \
    _out3 = __lsx_vadd_d(_bfly_i3, _bfly_i4);                                    \
    _out4 = __lsx_vsub_d(_bfly_i3, _bfly_i4);                                    \
    _out5 = __lsx_vsub_d(_bfly_i2, _bfly_i5);                                    \
    _out6 = __lsx_vsub_d(_bfly_i1, _bfly_i6);                                    \
    _out7 = __lsx_vsub_d(_bfly_i0, _bfly_i7);                                    \
}
656 
657 #endif //LSX
658 
659 #ifdef __loongarch_asx
660 #include <lasxintrin.h>
661 /*
662  * =============================================================================
663  * Description : Dot product of byte vector elements
664  * Arguments   : Inputs - in_h, in_l
665  *               Output - out
666  *               Return Type - signed halfword
667  * Details     : Unsigned byte elements from in_h are multiplied with
668  *               unsigned byte elements from in_l producing a result
669  *               twice the size of input i.e. signed halfword.
670  *               Then this multiplied results of adjacent odd-even elements
671  *               are added to the out vector
672  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
673  * =============================================================================
674  */
__lasx_xvdp2_h_bu(__m256i in_h,__m256i in_l)675 static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)
676 {
677     __m256i out;
678 
679     out = __lasx_xvmulwev_h_bu(in_h, in_l);
680     out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
681     return out;
682 }
683 
684 /*
685  * =============================================================================
686  * Description : Dot product of byte vector elements
687  * Arguments   : Inputs - in_h, in_l
688  *               Output - out
689  *               Return Type - signed halfword
690  * Details     : Signed byte elements from in_h are multiplied with
691  *               signed byte elements from in_l producing a result
692  *               twice the size of input i.e. signed halfword.
693  *               Then this iniplication results of adjacent odd-even elements
694  *               are added to the out vector
695  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
696  * =============================================================================
697  */
__lasx_xvdp2_h_b(__m256i in_h,__m256i in_l)698 static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)
699 {
700     __m256i out;
701 
702     out = __lasx_xvmulwev_h_b(in_h, in_l);
703     out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
704     return out;
705 }
706 
707 /*
708  * =============================================================================
709  * Description : Dot product of halfword vector elements
710  * Arguments   : Inputs - in_h, in_l
711  *               Output - out
712  *               Return Type - signed word
713  * Details     : Signed halfword elements from in_h are multiplied with
714  *               signed halfword elements from in_l producing a result
715  *               twice the size of input i.e. signed word.
716  *               Then this multiplied results of adjacent odd-even elements
717  *               are added to the out vector.
718  * Example     : out = __lasx_xvdp2_w_h(in_h, in_l)
719  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
720  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
721  *         out : 22,38,38,22, 22,38,38,22
722  * =============================================================================
723  */
__lasx_xvdp2_w_h(__m256i in_h,__m256i in_l)724 static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)
725 {
726     __m256i out;
727 
728     out = __lasx_xvmulwev_w_h(in_h, in_l);
729     out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
730     return out;
731 }
732 
733 /*
734  * =============================================================================
735  * Description : Dot product of word vector elements
736  * Arguments   : Inputs - in_h, in_l
737  *               Output - out
738  *               Retrun Type - signed double
739  * Details     : Signed word elements from in_h are multiplied with
740  *               signed word elements from in_l producing a result
741  *               twice the size of input i.e. signed double word.
742  *               Then this multiplied results of adjacent odd-even elements
743  *               are added to the out vector.
744  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
745  * =============================================================================
746  */
__lasx_xvdp2_d_w(__m256i in_h,__m256i in_l)747 static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)
748 {
749     __m256i out;
750 
751     out = __lasx_xvmulwev_d_w(in_h, in_l);
752     out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
753     return out;
754 }
755 
756 /*
757  * =============================================================================
758  * Description : Dot product of halfword vector elements
759  * Arguments   : Inputs - in_h, in_l
760  *               Output - out
761  *               Return Type - signed word
762  * Details     : Unsigned halfword elements from in_h are multiplied with
763  *               signed halfword elements from in_l producing a result
764  *               twice the size of input i.e. unsigned word.
765  *               Multiplication result of adjacent odd-even elements
766  *               are added to the out vector
767  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
768  * =============================================================================
769  */
__lasx_xvdp2_w_hu_h(__m256i in_h,__m256i in_l)770 static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
771 {
772     __m256i out;
773 
774     out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
775     out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
776     return out;
777 }
778 
779 /*
780  * =============================================================================
781  * Description : Dot product & addition of byte vector elements
782  * Arguments   : Inputs - in_h, in_l
783  *               Output - out
784  *               Retrun Type - halfword
785  * Details     : Signed byte elements from in_h are multiplied with
786  *               signed byte elements from in_l producing a result
787  *               twice the size of input i.e. signed halfword.
788  *               Then this multiplied results of adjacent odd-even elements
789  *               are added to the in_c vector.
790  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
791  * =============================================================================
792  */
__lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h,__m256i in_l)793 static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h, __m256i in_l)
794 {
795     __m256i out;
796 
797     out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
798     out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
799     return out;
800 }
801 
802 /*
803  * =============================================================================
804  * Description : Dot product of halfword vector elements
805  * Arguments   : Inputs - in_c, in_h, in_l
806  *               Output - out
807  *               Return Type - per RTYPE
808  * Details     : Signed halfword elements from in_h are multiplied with
809  *               signed halfword elements from in_l producing a result
810  *               twice the size of input i.e. signed word.
811  *               Multiplication result of adjacent odd-even elements
812  *               are added to the in_c vector.
813  * Example     : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
814  *        in_c : 1,2,3,4, 1,2,3,4
815  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
816  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
817  *         out : 23,40,41,26, 23,40,41,26
818  * =============================================================================
819  */
__lasx_xvdp2add_w_h(__m256i in_c,__m256i in_h,__m256i in_l)820 static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
821 {
822     __m256i out;
823 
824     out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
825     out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
826     return out;
827 }
828 
829 /*
830  * =============================================================================
831  * Description : Dot product of halfword vector elements
832  * Arguments   : Inputs - in_c, in_h, in_l
833  *               Output - out
834  *               Return Type - signed word
835  * Details     : Unsigned halfword elements from in_h are multiplied with
836  *               unsigned halfword elements from in_l producing a result
837  *               twice the size of input i.e. signed word.
838  *               Multiplication result of adjacent odd-even elements
839  *               are added to the in_c vector.
840  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
841  * =============================================================================
842  */
__lasx_xvdp2add_w_hu(__m256i in_c,__m256i in_h,__m256i in_l)843 static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i in_l)
844 {
845     __m256i out;
846 
847     out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
848     out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
849     return out;
850 }
851 
852 /*
853  * =============================================================================
854  * Description : Dot product of halfword vector elements
855  * Arguments   : Inputs - in_c, in_h, in_l
856  *               Output - out
857  *               Return Type - signed word
858  * Details     : Unsigned halfword elements from in_h are multiplied with
859  *               signed halfword elements from in_l producing a result
860  *               twice the size of input i.e. signed word.
861  *               Multiplication result of adjacent odd-even elements
862  *               are added to the in_c vector
863  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
864  * =============================================================================
865  */
__lasx_xvdp2add_w_hu_h(__m256i in_c,__m256i in_h,__m256i in_l)866 static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i in_l)
867 {
868     __m256i out;
869 
870     out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
871     out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
872     return out;
873 }
874 
875 /*
876  * =============================================================================
877  * Description : Vector Unsigned Dot Product and Subtract
878  * Arguments   : Inputs - in_c, in_h, in_l
879  *               Output - out
880  *               Return Type - signed halfword
881  * Details     : Unsigned byte elements from in_h are multiplied with
882  *               unsigned byte elements from in_l producing a result
883  *               twice the size of input i.e. signed halfword.
884  *               Multiplication result of adjacent odd-even elements
885  *               are added together and subtracted from double width elements
886  *               in_c vector.
887  * Example     : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
888  * =============================================================================
889  */
__lasx_xvdp2sub_h_bu(__m256i in_c,__m256i in_h,__m256i in_l)890 static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i in_l)
891 {
892     __m256i out;
893 
894     out = __lasx_xvmulwev_h_bu(in_h, in_l);
895     out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
896     out = __lasx_xvsub_h(in_c, out);
897     return out;
898 }
899 
900 /*
901  * =============================================================================
902  * Description : Vector Signed Dot Product and Subtract
903  * Arguments   : Inputs - in_c, in_h, in_l
904  *               Output - out
905  *               Return Type - signed word
906  * Details     : Signed halfword elements from in_h are multiplied with
907  *               Signed halfword elements from in_l producing a result
908  *               twice the size of input i.e. signed word.
909  *               Multiplication result of adjacent odd-even elements
910  *               are added together and subtracted from double width elements
911  *               in_c vector.
912  * Example     : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
913  *        in_c : 0,0,0,0, 0,0,0,0
914  *        in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
915  *        in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
916  *         out : -7,-3,0,0, 0,-1,0,-1
917  * =============================================================================
918  */
__lasx_xvdp2sub_w_h(__m256i in_c,__m256i in_h,__m256i in_l)919 static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
920 {
921     __m256i out;
922 
923     out = __lasx_xvmulwev_w_h(in_h, in_l);
924     out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
925     out = __lasx_xvsub_w(in_c, out);
926     return out;
927 }
928 
929 /*
930  * =============================================================================
931  * Description : Dot product of halfword vector elements
932  * Arguments   : Inputs - in_h, in_l
933  *               Output - out
934  *               Return Type - signed word
935  * Details     : Signed halfword elements from in_h are iniplied with
936  *               signed halfword elements from in_l producing a result
937  *               four times the size of input i.e. signed doubleword.
938  *               Then this iniplication results of four adjacent elements
939  *               are added together and stored to the out vector.
940  * Example     : out = __lasx_xvdp4_d_h(in_h, in_l)
941  *        in_h :  3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
942  *        in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
943  *         out : -2,0,1,1
944  * =============================================================================
945  */
__lasx_xvdp4_d_h(__m256i in_h,__m256i in_l)946 static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)
947 {
948     __m256i out;
949 
950     out = __lasx_xvmulwev_w_h(in_h, in_l);
951     out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
952     out = __lasx_xvhaddw_d_w(out, out);
953     return out;
954 }
955 
956 /*
957  * =============================================================================
958  * Description : The high half of the vector elements are expanded and
959  *               added after being doubled.
960  * Arguments   : Inputs - in_h, in_l
961  *               Output - out
962  * Details     : The in_h vector and the in_l vector are added after the
963  *               higher half of the two-fold sign extension (signed byte
964  *               to signed halfword) and stored to the out vector.
965  * Example     : See out = __lasx_xvaddwh_w_h(in_h, in_l)
966  * =============================================================================
967  */
__lasx_xvaddwh_h_b(__m256i in_h,__m256i in_l)968 static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
969 {
970     __m256i out;
971 
972     out = __lasx_xvilvh_b(in_h, in_l);
973     out = __lasx_xvhaddw_h_b(out, out);
974     return out;
975 }
976 
977 /*
978  * =============================================================================
979  * Description : The high half of the vector elements are expanded and
980  *               added after being doubled.
981  * Arguments   : Inputs - in_h, in_l
982  *               Output - out
983  * Details     : The in_h vector and the in_l vector are added after the
984  *               higher half of the two-fold sign extension (signed halfword
985  *               to signed word) and stored to the out vector.
986  * Example     : out = __lasx_xvaddwh_w_h(in_h, in_l)
987  *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
988  *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
989  *         out : 1,0,0,-1, 1,0,0, 2
990  * =============================================================================
991  */
__lasx_xvaddwh_w_h(__m256i in_h,__m256i in_l)992  static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l)
993 {
994     __m256i out;
995 
996     out = __lasx_xvilvh_h(in_h, in_l);
997     out = __lasx_xvhaddw_w_h(out, out);
998     return out;
999 }
1000 
1001 /*
1002  * =============================================================================
1003  * Description : The low half of the vector elements are expanded and
1004  *               added after being doubled.
1005  * Arguments   : Inputs - in_h, in_l
1006  *               Output - out
1007  * Details     : The in_h vector and the in_l vector are added after the
1008  *               lower half of the two-fold sign extension (signed byte
1009  *               to signed halfword) and stored to the out vector.
1010  * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1011  * =============================================================================
1012  */
__lasx_xvaddwl_h_b(__m256i in_h,__m256i in_l)1013 static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)
1014 {
1015     __m256i out;
1016 
1017     out = __lasx_xvilvl_b(in_h, in_l);
1018     out = __lasx_xvhaddw_h_b(out, out);
1019     return out;
1020 }
1021 
1022 /*
1023  * =============================================================================
1024  * Description : The low half of the vector elements are expanded and
1025  *               added after being doubled.
1026  * Arguments   : Inputs - in_h, in_l
1027  *               Output - out
1028  * Details     : The in_h vector and the in_l vector are added after the
1029  *               lower half of the two-fold sign extension (signed halfword
1030  *               to signed word) and stored to the out vector.
1031  * Example     : out = __lasx_xvaddwl_w_h(in_h, in_l)
1032  *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1033  *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1034  *         out : 5,-1,4,2, 1,0,2,-1
1035  * =============================================================================
1036  */
__lasx_xvaddwl_w_h(__m256i in_h,__m256i in_l)1037 static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)
1038 {
1039     __m256i out;
1040 
1041     out = __lasx_xvilvl_h(in_h, in_l);
1042     out = __lasx_xvhaddw_w_h(out, out);
1043     return out;
1044 }
1045 
1046 /*
1047  * =============================================================================
1048  * Description : The low half of the vector elements are expanded and
1049  *               added after being doubled.
1050  * Arguments   : Inputs - in_h, in_l
1051  *               Output - out
1052  * Details     : The out vector and the out vector are added after the
1053  *               lower half of the two-fold zero extension (unsigned byte
1054  *               to unsigned halfword) and stored to the out vector.
1055  * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1056  * =============================================================================
1057  */
__lasx_xvaddwl_h_bu(__m256i in_h,__m256i in_l)1058 static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)
1059 {
1060     __m256i out;
1061 
1062     out = __lasx_xvilvl_b(in_h, in_l);
1063     out = __lasx_xvhaddw_hu_bu(out, out);
1064     return out;
1065 }
1066 
1067 /*
1068  * =============================================================================
1069  * Description : The low half of the vector elements are expanded and
1070  *               added after being doubled.
1071  * Arguments   : Inputs - in_h, in_l
1072  *               Output - out
1073  * Details     : The in_l vector after double zero extension (unsigned byte to
1074  *               signed halfword),added to the in_h vector.
1075  * Example     : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
1076  * =============================================================================
1077  */
__lasx_xvaddw_h_h_bu(__m256i in_h,__m256i in_l)1078 static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)
1079 {
1080     __m256i out;
1081 
1082     out = __lasx_xvsllwil_hu_bu(in_l, 0);
1083     out = __lasx_xvadd_h(in_h, out);
1084     return out;
1085 }
1086 
1087 /*
1088  * =============================================================================
1089  * Description : The low half of the vector elements are expanded and
1090  *               added after being doubled.
1091  * Arguments   : Inputs - in_h, in_l
1092  *               Output - out
1093  * Details     : The in_l vector after double sign extension (signed halfword to
1094  *               signed word), added to the in_h vector.
1095  * Example     : out = __lasx_xvaddw_w_w_h(in_h, in_l)
1096  *        in_h : 0, 1,0,0, -1,0,0,1,
1097  *        in_l : 2,-1,1,2,  1,0,0,0, 0,0,1,0, 1,0,0,1,
1098  *         out : 2, 0,1,2, -1,0,1,1,
1099  * =============================================================================
1100  */
__lasx_xvaddw_w_w_h(__m256i in_h,__m256i in_l)1101 static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)
1102 {
1103     __m256i out;
1104 
1105     out = __lasx_xvsllwil_w_h(in_l, 0);
1106     out = __lasx_xvadd_w(in_h, out);
1107     return out;
1108 }
1109 
1110 /*
1111  * =============================================================================
1112  * Description : Multiplication and addition calculation after expansion
1113  *               of the lower half of the vector.
1114  * Arguments   : Inputs - in_c, in_h, in_l
1115  *               Output - out
1116  * Details     : The in_h vector and the in_l vector are multiplied after
1117  *               the lower half of the two-fold sign extension (signed halfword
1118  *               to signed word), and the result is added to the vector in_c,
1119  *               then stored to the out vector.
1120  * Example     : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1121  *        in_c : 1,2,3,4, 5,6,7,8
1122  *        in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
1123  *        in_l : 200, 300, 400, 500,  2000, 3000, 4000, 5000,
1124  *              -200,-300,-400,-500, -2000,-3000,-4000,-5000
1125  *         out : 201, 602,1203,2004, -995, -1794,-2793,-3992
1126  * =============================================================================
1127  */
__lasx_xvmaddwl_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1128 static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
1129 {
1130     __m256i tmp0, tmp1, out;
1131 
1132     tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1133     tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1134     tmp0 = __lasx_xvmul_w(tmp0, tmp1);
1135     out  = __lasx_xvadd_w(tmp0, in_c);
1136     return out;
1137 }
1138 
1139 /*
1140  * =============================================================================
1141  * Description : Multiplication and addition calculation after expansion
1142  *               of the higher half of the vector.
1143  * Arguments   : Inputs - in_c, in_h, in_l
1144  *               Output - out
1145  * Details     : The in_h vector and the in_l vector are multiplied after
1146  *               the higher half of the two-fold sign extension (signed
1147  *               halfword to signed word), and the result is added to
1148  *               the vector in_c, then stored to the out vector.
1149  * Example     : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1150  * =============================================================================
1151  */
__lasx_xvmaddwh_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1152 static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
1153 {
1154     __m256i tmp0, tmp1, out;
1155 
1156     tmp0 = __lasx_xvilvh_h(in_h, in_h);
1157     tmp1 = __lasx_xvilvh_h(in_l, in_l);
1158     tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
1159     out  = __lasx_xvadd_w(tmp0, in_c);
1160     return out;
1161 }
1162 
1163 /*
1164  * =============================================================================
1165  * Description : Multiplication calculation after expansion of the lower
1166  *               half of the vector.
1167  * Arguments   : Inputs - in_h, in_l
1168  *               Output - out
1169  * Details     : The in_h vector and the in_l vector are multiplied after
1170  *               the lower half of the two-fold sign extension (signed
1171  *               halfword to signed word), then stored to the out vector.
1172  * Example     : out = __lasx_xvmulwl_w_h(in_h, in_l)
1173  *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1174  *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1175  *         out : 6,1,3,0, 0,0,1,0
1176  * =============================================================================
1177  */
__lasx_xvmulwl_w_h(__m256i in_h,__m256i in_l)1178 static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l)
1179 {
1180     __m256i tmp0, tmp1, out;
1181 
1182     tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1183     tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1184     out  = __lasx_xvmul_w(tmp0, tmp1);
1185     return out;
1186 }
1187 
1188 /*
1189  * =============================================================================
1190  * Description : Multiplication calculation after expansion of the lower
1191  *               half of the vector.
1192  * Arguments   : Inputs - in_h, in_l
1193  *               Output - out
1194  * Details     : The in_h vector and the in_l vector are multiplied after
1195  *               the lower half of the two-fold sign extension (signed
1196  *               halfword to signed word), then stored to the out vector.
1197  * Example     : out = __lasx_xvmulwh_w_h(in_h, in_l)
1198  *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1199  *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1200  *         out : 0,0,0,0, 0,0,0,1
1201  * =============================================================================
1202  */
__lasx_xvmulwh_w_h(__m256i in_h,__m256i in_l)1203 static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l)
1204 {
1205     __m256i tmp0, tmp1, out;
1206 
1207     tmp0 = __lasx_xvilvh_h(in_h, in_h);
1208     tmp1 = __lasx_xvilvh_h(in_l, in_l);
1209     out  = __lasx_xvmulwev_w_h(tmp0, tmp1);
1210     return out;
1211 }
1212 
1213 /*
1214  * =============================================================================
1215  * Description : The low half of the vector elements are expanded and
1216  *               added saturately after being doubled.
1217  * Arguments   : Inputs - in_h, in_l
1218  *               Output - out
1219  * Details     : The in_h vector adds the in_l vector saturately after the lower
1220  *               half of the two-fold zero extension (unsigned byte to unsigned
1221  *               halfword) and the results are stored to the out vector.
1222  * Example     : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
1223  *        in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
1224  *        in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
1225  *         out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
1226  * =============================================================================
1227  */
__lasx_xvsaddw_hu_hu_bu(__m256i in_h,__m256i in_l)1228 static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l)
1229 {
1230     __m256i tmp1, out;
1231     __m256i zero = {0};
1232 
1233     tmp1 = __lasx_xvilvl_b(zero, in_l);
1234     out  = __lasx_xvsadd_hu(in_h, tmp1);
1235     return out;
1236 }
1237 
1238 /*
1239  * =============================================================================
1240  * Description : Clip all halfword elements of input vector between min & max
1241  *               out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
1242  * Arguments   : Inputs  - in    (input vector)
1243  *                       - min   (min threshold)
1244  *                       - max   (max threshold)
1245  *               Outputs - in    (output vector with clipped elements)
1246  *               Return Type - signed halfword
1247  * Example     : out = __lasx_xvclip_h(in, min, max)
1248  *          in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
1249  *         min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
1250  *         max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
1251  *         out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
1252  * =============================================================================
1253  */
__lasx_xvclip_h(__m256i in,__m256i min,__m256i max)1254 static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max)
1255 {
1256     __m256i out;
1257 
1258     out = __lasx_xvmax_h(min, in);
1259     out = __lasx_xvmin_h(max, out);
1260     return out;
1261 }
1262 
1263 /*
1264  * =============================================================================
1265  * Description : Clip all signed halfword elements of input vector
1266  *               between 0 & 255
1267  * Arguments   : Inputs  - in   (input vector)
1268  *               Outputs - out  (output vector with clipped elements)
1269  *               Return Type - signed halfword
1270  * Example     : See out = __lasx_xvclip255_w(in)
1271  * =============================================================================
1272  */
__lasx_xvclip255_h(__m256i in)1273 static inline __m256i __lasx_xvclip255_h(__m256i in)
1274 {
1275     __m256i out;
1276 
1277     out = __lasx_xvmaxi_h(in, 0);
1278     out = __lasx_xvsat_hu(out, 7);
1279     return out;
1280 }
1281 
1282 /*
1283  * =============================================================================
1284  * Description : Clip all signed word elements of input vector
1285  *               between 0 & 255
1286  * Arguments   : Inputs - in   (input vector)
1287  *               Output - out  (output vector with clipped elements)
1288  *               Return Type - signed word
1289  * Example     : out = __lasx_xvclip255_w(in)
1290  *          in : -8,255,280,249, -8,255,280,249
1291  *         out :  0,255,255,249,  0,255,255,249
1292  * =============================================================================
1293  */
__lasx_xvclip255_w(__m256i in)1294 static inline __m256i __lasx_xvclip255_w(__m256i in)
1295 {
1296     __m256i out;
1297 
1298     out = __lasx_xvmaxi_w(in, 0);
1299     out = __lasx_xvsat_wu(out, 7);
1300     return out;
1301 }
1302 
1303 /*
1304  * =============================================================================
1305  * Description : Indexed halfword element values are replicated to all
1306  *               elements in output vector. If 'indx < 8' use xvsplati_l_*,
1307  *               if 'indx >= 8' use xvsplati_h_*.
1308  * Arguments   : Inputs - in, idx
1309  *               Output - out
1310  * Details     : Idx element value from in vector is replicated to all
1311  *               elements in out vector.
1312  *               Valid index range for halfword operation is 0-7
1313  * Example     : out = __lasx_xvsplati_l_h(in, idx)
1314  *          in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
1315  *         idx : 0x02
1316  *         out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
1317  * =============================================================================
1318  */
__lasx_xvsplati_l_h(__m256i in,int idx)1319 static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx)
1320 {
1321     __m256i out;
1322 
1323     out = __lasx_xvpermi_q(in, in, 0x02);
1324     out = __lasx_xvreplve_h(out, idx);
1325     return out;
1326 }
1327 
1328 /*
1329  * =============================================================================
1330  * Description : Indexed halfword element values are replicated to all
1331  *               elements in output vector. If 'indx < 8' use xvsplati_l_*,
1332  *               if 'indx >= 8' use xvsplati_h_*.
1333  * Arguments   : Inputs - in, idx
1334  *               Output - out
1335  * Details     : Idx element value from in vector is replicated to all
1336  *               elements in out vector.
1337  *               Valid index range for halfword operation is 0-7
1338  * Example     : out = __lasx_xvsplati_h_h(in, idx)
1339  *          in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
1340  *         idx : 0x09
1341  *         out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1342  * =============================================================================
1343  */
__lasx_xvsplati_h_h(__m256i in,int idx)1344 static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
1345 {
1346     __m256i out;
1347 
1348     out = __lasx_xvpermi_q(in, in, 0x13);
1349     out = __lasx_xvreplve_h(out, idx);
1350     return out;
1351 }
1352 
/*
 * =============================================================================
 * Description : Transpose a 4x4 block of double word elements held in four
 *               vectors.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. All inputs are read before any output is written.
 * Example     : LASX_TRANSPOSE4x4_D
 *        _in0 : 1,2,3,4
 *        _in1 : 1,2,3,4
 *        _in2 : 1,2,3,4
 *        _in3 : 1,2,3,4
 *
 *       _out0 : 1,1,1,1
 *       _out1 : 2,2,2,2
 *       _out2 : 3,3,3,3
 *       _out3 : 4,4,4,4
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{ \
    __m256i _lo01_m, _hi01_m, _lo23_m, _hi23_m; \
    \
    /* Pair rows 0/1 and rows 2/3 double word by double word. */ \
    _lo01_m = __lasx_xvilvl_d(_in1, _in0); \
    _hi01_m = __lasx_xvilvh_d(_in1, _in0); \
    _lo23_m = __lasx_xvilvl_d(_in3, _in2); \
    _hi23_m = __lasx_xvilvh_d(_in3, _in2); \
    /* Join matching 128-bit halves of the pairs into full output rows. */ \
    DUP4_ARG3(__lasx_xvpermi_q, \
              _lo23_m, _lo01_m, 0x20, _lo23_m, _lo01_m, 0x31, \
              _hi23_m, _hi01_m, 0x20, _hi23_m, _hi01_m, 0x31, \
              _out0, _out2, _out1, _out3); \
}
1382 
/*
 * =============================================================================
 * Description : Transpose an 8x8 block of word elements held in eight vectors.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. All inputs are read before any output is written.
 * Example     : LASX_TRANSPOSE8x8_W
 *        _in0 : 1,2,3,4,5,6,7,8
 *        _in1 : 2,2,3,4,5,6,7,8
 *        _in2 : 3,2,3,4,5,6,7,8
 *        _in3 : 4,2,3,4,5,6,7,8
 *        _in4 : 5,2,3,4,5,6,7,8
 *        _in5 : 6,2,3,4,5,6,7,8
 *        _in6 : 7,2,3,4,5,6,7,8
 *        _in7 : 8,2,3,4,5,6,7,8
 *
 *       _out0 : 1,2,3,4,5,6,7,8
 *       _out1 : 2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    __m256i _even_m, _odd_m; \
    __m256i _q0_m, _q1_m, _q2_m, _q3_m; \
    __m256i _q4_m, _q5_m, _q6_m, _q7_m; \
    \
    /* Rows 0-3: interleave even rows (0,2) and odd rows (1,3), then mix. */ \
    DUP2_ARG2(__lasx_xvilvl_w, _in2, _in0, _in3, _in1, _even_m, _odd_m); \
    _q0_m = __lasx_xvilvl_w(_odd_m, _even_m); \
    _q1_m = __lasx_xvilvh_w(_odd_m, _even_m); \
    DUP2_ARG2(__lasx_xvilvh_w, _in2, _in0, _in3, _in1, _even_m, _odd_m); \
    _q2_m = __lasx_xvilvl_w(_odd_m, _even_m); \
    _q3_m = __lasx_xvilvh_w(_odd_m, _even_m); \
    /* Rows 4-7: same scheme with rows (4,6) and (5,7). */ \
    DUP2_ARG2(__lasx_xvilvl_w, _in6, _in4, _in7, _in5, _even_m, _odd_m); \
    _q4_m = __lasx_xvilvl_w(_odd_m, _even_m); \
    _q5_m = __lasx_xvilvh_w(_odd_m, _even_m); \
    DUP2_ARG2(__lasx_xvilvh_w, _in6, _in4, _in7, _in5, _even_m, _odd_m); \
    _q6_m = __lasx_xvilvl_w(_odd_m, _even_m); \
    _q7_m = __lasx_xvilvh_w(_odd_m, _even_m); \
    /* Join the rows 0-3 and rows 4-7 parts, 128-bit half by half. */ \
    DUP4_ARG3(__lasx_xvpermi_q, \
              _q4_m, _q0_m, 0x20, _q5_m, _q1_m, 0x20, \
              _q6_m, _q2_m, 0x20, _q7_m, _q3_m, 0x20, \
              _out0, _out1, _out2, _out3); \
    DUP4_ARG3(__lasx_xvpermi_q, \
              _q4_m, _q0_m, 0x31, _q5_m, _q1_m, 0x31, \
              _q6_m, _q2_m, 0x31, _q7_m, _q3_m, 0x31, \
              _out4, _out5, _out6, _out7); \
}
1440 
/*
 * =============================================================================
 * Description : Transpose an input 16x8 byte block.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 *                         (output 8x16 byte block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. The output vectors double as scratch storage between
 *               the interleave stages; every input is read before the first
 *               output is written.
 * Example     : See LASX_TRANSPOSE16x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15, \
                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    __m256i _v0_m, _v1_m, _v2_m, _v3_m; \
    __m256i _v4_m, _v5_m, _v6_m, _v7_m; \
    \
    /* Byte stage 1: interleave rows that are two apart. */ \
    DUP4_ARG2(__lasx_xvilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
              _v0_m, _v1_m, _v2_m, _v3_m); \
    DUP4_ARG2(__lasx_xvilvl_b, _in10, _in8, _in11, _in9, \
              _in14, _in12, _in15, _in13, \
              _v4_m, _v5_m, _v6_m, _v7_m); \
    /* Byte stage 2: the outputs are used as scratch from here on. */ \
    _out0 = __lasx_xvilvl_b(_v1_m, _v0_m); \
    _out1 = __lasx_xvilvh_b(_v1_m, _v0_m); \
    _out2 = __lasx_xvilvl_b(_v3_m, _v2_m); \
    _out3 = __lasx_xvilvh_b(_v3_m, _v2_m); \
    _out4 = __lasx_xvilvl_b(_v5_m, _v4_m); \
    _out5 = __lasx_xvilvh_b(_v5_m, _v4_m); \
    _out6 = __lasx_xvilvl_b(_v7_m, _v6_m); \
    _out7 = __lasx_xvilvh_b(_v7_m, _v6_m); \
    /* Word stage: low words go to _v0/_v4/_v1/_v5, high to the others. */ \
    DUP4_ARG2(__lasx_xvilvl_w, _out2, _out0, _out3, _out1, \
              _out6, _out4, _out7, _out5, \
              _v0_m, _v4_m, _v1_m, _v5_m); \
    DUP4_ARG2(__lasx_xvilvh_w, _out2, _out0, _out3, _out1, \
              _out6, _out4, _out7, _out5, \
              _v2_m, _v6_m, _v3_m, _v7_m); \
    /* Double word stage: assemble the final rows. */ \
    _out0 = __lasx_xvilvl_d(_v1_m, _v0_m); \
    _out1 = __lasx_xvilvh_d(_v1_m, _v0_m); \
    _out2 = __lasx_xvilvl_d(_v3_m, _v2_m); \
    _out3 = __lasx_xvilvh_d(_v3_m, _v2_m); \
    _out4 = __lasx_xvilvl_d(_v5_m, _v4_m); \
    _out5 = __lasx_xvilvh_d(_v5_m, _v4_m); \
    _out6 = __lasx_xvilvl_d(_v7_m, _v6_m); \
    _out7 = __lasx_xvilvh_d(_v7_m, _v6_m); \
}
1493 
/*
 * =============================================================================
 * Description : Transpose an input 16x8 halfword block.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 halfword block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 *                         (output 8x16 halfword block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Every input is read before the first output is written,
 *               so the same variables may be passed as inputs and outputs.
 * Example     : LASX_TRANSPOSE16x8_H
 *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *
 *       _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
 *       _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15, \
                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    __m256i _col0_m, _col1_m, _col2_m, _col3_m; \
    \
    /* Columns 0-3: built from the low halfwords of every input row. */ \
    _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
    _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
    _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
    _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
    _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
    _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
    _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
    _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
    /* Keep these in locals: the inputs are read again below, and they */ \
    /* may be the very same variables as the outputs. */ \
    _col0_m = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
    _col1_m = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
    _col2_m = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
    _col3_m = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
    \
    /* Columns 4-7: same scheme on the high halfwords of every input row. */ \
    _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
    _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
    _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
    _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
    _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
    _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
    _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
    _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
    \
    /* All inputs are consumed; publishing columns 0-3 is safe now. */ \
    _out0 = _col0_m; \
    _out1 = _col1_m; \
    _out2 = _col2_m; \
    _out3 = _col3_m; \
}
1597 
/*
 * =============================================================================
 * Description : Transpose a 4x4 block of halfword elements held in four
 *               vectors.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 *               Return Type - signed halfword
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. _out1 and _out3 are derived from _out0 and _out2.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{ \
    __m256i _r01_m, _r23_m; \
    \
    /* Pair rows 0/1 and rows 2/3 halfword by halfword. */ \
    DUP2_ARG2(__lasx_xvilvl_h, _in1, _in0, _in3, _in2, _r01_m, _r23_m); \
    _out0 = __lasx_xvilvl_w(_r23_m, _r01_m); \
    _out2 = __lasx_xvilvh_w(_r23_m, _r01_m); \
    /* The odd outputs are the upper double words of the even ones. */ \
    _out1 = __lasx_xvilvh_d(_out0, _out0); \
    _out3 = __lasx_xvilvh_d(_out2, _out2); \
}
1619 
/*
 * =============================================================================
 * Description : Transpose an input 8x8 byte block.
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 *                         (output 8x8 byte block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. The odd outputs are the even outputs shifted right by
 *               8 bytes.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0, \
                            _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    __m256i _p0_m, _p1_m, _p2_m, _p3_m; \
    __m256i _q0_m, _q1_m, _q2_m, _q3_m; \
    \
    /* Byte stage 1: interleave rows that are two apart. */ \
    DUP4_ARG2(__lasx_xvilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
              _p0_m, _p1_m, _p2_m, _p3_m); \
    /* Byte stage 2: merge the pairs into groups of four rows. */ \
    _q0_m = __lasx_xvilvl_b(_p1_m, _p0_m); \
    _q1_m = __lasx_xvilvh_b(_p1_m, _p0_m); \
    _q2_m = __lasx_xvilvl_b(_p3_m, _p2_m); \
    _q3_m = __lasx_xvilvh_b(_p3_m, _p2_m); \
    /* Word stage: each even output now carries two result rows. */ \
    _out0 = __lasx_xvilvl_w(_q2_m, _q0_m); \
    _out2 = __lasx_xvilvh_w(_q2_m, _q0_m); \
    _out4 = __lasx_xvilvl_w(_q3_m, _q1_m); \
    _out6 = __lasx_xvilvh_w(_q3_m, _q1_m); \
    /* Shift by 8 bytes to expose the second row of each even output. */ \
    DUP4_ARG2(__lasx_xvbsrl_v, _out0, 8, _out2, 8, _out4, 8, _out6, 8, \
              _out1, _out3, _out5, _out7); \
}
1652 
/*
 * =============================================================================
 * Description : Transpose an 8x8 block of halfword elements held in eight
 *               vectors.
 * Arguments   : Inputs  - _in0, _in1, ~
 *               Outputs - _out0, _out1, ~
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. All inputs are read before any output is written.
 * Example     : LASX_TRANSPOSE8x8_H
 *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *
 *       _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
 *       _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
 *       _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
 *       _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
 *       _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
 *       _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
 *       _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
 *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0, \
                            _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    __m256i _even_m, _odd_m; \
    __m256i _bot0_m, _bot1_m, _bot2_m, _bot3_m; \
    __m256i _top0_m, _top1_m, _top2_m, _top3_m; \
    \
    /* Rows 4-7: interleave even rows (4,6) and odd rows (5,7), then mix. */ \
    DUP2_ARG2(__lasx_xvilvl_h, _in6, _in4, _in7, _in5, _even_m, _odd_m); \
    _bot0_m = __lasx_xvilvl_h(_odd_m, _even_m); \
    _bot1_m = __lasx_xvilvh_h(_odd_m, _even_m); \
    DUP2_ARG2(__lasx_xvilvh_h, _in6, _in4, _in7, _in5, _even_m, _odd_m); \
    _bot2_m = __lasx_xvilvl_h(_odd_m, _even_m); \
    _bot3_m = __lasx_xvilvh_h(_odd_m, _even_m); \
    \
    /* Rows 0-3: same scheme with rows (0,2) and (1,3). */ \
    DUP2_ARG2(__lasx_xvilvl_h, _in2, _in0, _in3, _in1, _even_m, _odd_m); \
    _top0_m = __lasx_xvilvl_h(_odd_m, _even_m); \
    _top1_m = __lasx_xvilvh_h(_odd_m, _even_m); \
    DUP2_ARG2(__lasx_xvilvh_h, _in2, _in0, _in3, _in1, _even_m, _odd_m); \
    _top2_m = __lasx_xvilvl_h(_odd_m, _even_m); \
    _top3_m = __lasx_xvilvh_h(_odd_m, _even_m); \
    \
    /* Even double words form the even outputs, odd ones the odd outputs. */ \
    DUP4_ARG2(__lasx_xvpickev_d, _bot0_m, _top0_m, _bot1_m, _top1_m, \
              _bot2_m, _top2_m, _bot3_m, _top3_m, \
              _out0, _out2, _out4, _out6); \
    DUP4_ARG2(__lasx_xvpickod_d, _bot0_m, _top0_m, _bot1_m, _top1_m, \
              _bot2_m, _top2_m, _bot3_m, _top3_m, \
              _out1, _out3, _out5, _out7); \
}
1713 
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation. The macro exists in byte (_B),
 *               halfword (_H), word (_W) and double word (_D) variants.
 *               Outputs are written in the order _out0, _out1, _out2, _out3
 *               while the inputs are still being read.
 * Example     : LASX_BUTTERFLY_4_B / _H / _W / _D
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 * =============================================================================
 */
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{ \
    /* Sums first, differences second. */ \
    DUP2_ARG2(__lasx_xvadd_b, _in0, _in3, _in1, _in2, _out0, _out1); \
    DUP2_ARG2(__lasx_xvsub_b, _in1, _in2, _in0, _in3, _out2, _out3); \
}
/*
 * 4-input butterfly built on __lasx_xvadd_h / __lasx_xvsub_h (halfword
 * elements). Results are staged in temporaries first, so outputs may alias
 * inputs (in-place use) safely.
 */
#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3)  \
{                                                                               \
    __m256i _bf_s0_m = __lasx_xvadd_h(_in0, _in3);                              \
    __m256i _bf_s1_m = __lasx_xvadd_h(_in1, _in2);                              \
    __m256i _bf_d1_m = __lasx_xvsub_h(_in1, _in2);                              \
    __m256i _bf_d0_m = __lasx_xvsub_h(_in0, _in3);                              \
    _out0 = _bf_s0_m;                                                           \
    _out1 = _bf_s1_m;                                                           \
    _out2 = _bf_d1_m;                                                           \
    _out3 = _bf_d0_m;                                                           \
}
/*
 * 4-input butterfly built on __lasx_xvadd_w / __lasx_xvsub_w (word elements).
 * Results are staged in temporaries first, so outputs may alias inputs
 * (in-place use) safely.
 */
#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3)  \
{                                                                               \
    __m256i _bf_s0_m = __lasx_xvadd_w(_in0, _in3);                              \
    __m256i _bf_s1_m = __lasx_xvadd_w(_in1, _in2);                              \
    __m256i _bf_d1_m = __lasx_xvsub_w(_in1, _in2);                              \
    __m256i _bf_d0_m = __lasx_xvsub_w(_in0, _in3);                              \
    _out0 = _bf_s0_m;                                                           \
    _out1 = _bf_s1_m;                                                           \
    _out2 = _bf_d1_m;                                                           \
    _out3 = _bf_d0_m;                                                           \
}
/*
 * 4-input butterfly built on __lasx_xvadd_d / __lasx_xvsub_d (doubleword
 * elements). Results are staged in temporaries first, so outputs may alias
 * inputs (in-place use) safely.
 */
#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3)  \
{                                                                               \
    __m256i _bf_s0_m = __lasx_xvadd_d(_in0, _in3);                              \
    __m256i _bf_s1_m = __lasx_xvadd_d(_in1, _in2);                              \
    __m256i _bf_d1_m = __lasx_xvsub_d(_in1, _in2);                              \
    __m256i _bf_d0_m = __lasx_xvsub_d(_in0, _in3);                              \
    _out0 = _bf_s0_m;                                                           \
    _out1 = _bf_s1_m;                                                           \
    _out2 = _bf_d1_m;                                                           \
    _out3 = _bf_d0_m;                                                           \
}
1755 
1756 /*
1757  * =============================================================================
1758  * Description : Butterfly of 8 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
1761  * Details     : Butterfly operation
1762  * Example     : LASX_BUTTERFLY_8
1763  *               _out0 = _in0 + _in7;
1764  *               _out1 = _in1 + _in6;
1765  *               _out2 = _in2 + _in5;
1766  *               _out3 = _in3 + _in4;
1767  *               _out4 = _in3 - _in4;
1768  *               _out5 = _in2 - _in5;
1769  *               _out6 = _in1 - _in6;
1770  *               _out7 = _in0 - _in7;
1771  * =============================================================================
1772  */
/*
 * 8-input butterfly built on __lasx_xvadd_b / __lasx_xvsub_b (byte elements).
 * All eight results are computed into temporaries before any output is
 * written, so an output argument may name the same variable as an input
 * (in-place use) without corrupting the later differences.
 */
#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
{                                                                                 \
    __m256i _bf_s0_m = __lasx_xvadd_b(_in0, _in7);                                \
    __m256i _bf_s1_m = __lasx_xvadd_b(_in1, _in6);                                \
    __m256i _bf_s2_m = __lasx_xvadd_b(_in2, _in5);                                \
    __m256i _bf_s3_m = __lasx_xvadd_b(_in3, _in4);                                \
    __m256i _bf_d3_m = __lasx_xvsub_b(_in3, _in4);                                \
    __m256i _bf_d2_m = __lasx_xvsub_b(_in2, _in5);                                \
    __m256i _bf_d1_m = __lasx_xvsub_b(_in1, _in6);                                \
    __m256i _bf_d0_m = __lasx_xvsub_b(_in0, _in7);                                \
    _out0 = _bf_s0_m;                                                             \
    _out1 = _bf_s1_m;                                                             \
    _out2 = _bf_s2_m;                                                             \
    _out3 = _bf_s3_m;                                                             \
    _out4 = _bf_d3_m;                                                             \
    _out5 = _bf_d2_m;                                                             \
    _out6 = _bf_d1_m;                                                             \
    _out7 = _bf_d0_m;                                                             \
}
1785 
/*
 * 8-input butterfly built on __lasx_xvadd_h / __lasx_xvsub_h (halfword
 * elements). Results are staged in temporaries first, so outputs may alias
 * inputs (in-place use) safely.
 */
#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
{                                                                                 \
    __m256i _bf_s0_m = __lasx_xvadd_h(_in0, _in7);                                \
    __m256i _bf_s1_m = __lasx_xvadd_h(_in1, _in6);                                \
    __m256i _bf_s2_m = __lasx_xvadd_h(_in2, _in5);                                \
    __m256i _bf_s3_m = __lasx_xvadd_h(_in3, _in4);                                \
    __m256i _bf_d3_m = __lasx_xvsub_h(_in3, _in4);                                \
    __m256i _bf_d2_m = __lasx_xvsub_h(_in2, _in5);                                \
    __m256i _bf_d1_m = __lasx_xvsub_h(_in1, _in6);                                \
    __m256i _bf_d0_m = __lasx_xvsub_h(_in0, _in7);                                \
    _out0 = _bf_s0_m;                                                             \
    _out1 = _bf_s1_m;                                                             \
    _out2 = _bf_s2_m;                                                             \
    _out3 = _bf_s3_m;                                                             \
    _out4 = _bf_d3_m;                                                             \
    _out5 = _bf_d2_m;                                                             \
    _out6 = _bf_d1_m;                                                             \
    _out7 = _bf_d0_m;                                                             \
}
1798 
/*
 * 8-input butterfly built on __lasx_xvadd_w / __lasx_xvsub_w (word elements).
 * Results are staged in temporaries first, so outputs may alias inputs
 * (in-place use) safely.
 */
#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
{                                                                                 \
    __m256i _bf_s0_m = __lasx_xvadd_w(_in0, _in7);                                \
    __m256i _bf_s1_m = __lasx_xvadd_w(_in1, _in6);                                \
    __m256i _bf_s2_m = __lasx_xvadd_w(_in2, _in5);                                \
    __m256i _bf_s3_m = __lasx_xvadd_w(_in3, _in4);                                \
    __m256i _bf_d3_m = __lasx_xvsub_w(_in3, _in4);                                \
    __m256i _bf_d2_m = __lasx_xvsub_w(_in2, _in5);                                \
    __m256i _bf_d1_m = __lasx_xvsub_w(_in1, _in6);                                \
    __m256i _bf_d0_m = __lasx_xvsub_w(_in0, _in7);                                \
    _out0 = _bf_s0_m;                                                             \
    _out1 = _bf_s1_m;                                                             \
    _out2 = _bf_s2_m;                                                             \
    _out3 = _bf_s3_m;                                                             \
    _out4 = _bf_d3_m;                                                             \
    _out5 = _bf_d2_m;                                                             \
    _out6 = _bf_d1_m;                                                             \
    _out7 = _bf_d0_m;                                                             \
}
1811 
/*
 * 8-input butterfly built on __lasx_xvadd_d / __lasx_xvsub_d (doubleword
 * elements). Results are staged in temporaries first, so outputs may alias
 * inputs (in-place use) safely.
 */
#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7)\
{                                                                                 \
    __m256i _bf_s0_m = __lasx_xvadd_d(_in0, _in7);                                \
    __m256i _bf_s1_m = __lasx_xvadd_d(_in1, _in6);                                \
    __m256i _bf_s2_m = __lasx_xvadd_d(_in2, _in5);                                \
    __m256i _bf_s3_m = __lasx_xvadd_d(_in3, _in4);                                \
    __m256i _bf_d3_m = __lasx_xvsub_d(_in3, _in4);                                \
    __m256i _bf_d2_m = __lasx_xvsub_d(_in2, _in5);                                \
    __m256i _bf_d1_m = __lasx_xvsub_d(_in1, _in6);                                \
    __m256i _bf_d0_m = __lasx_xvsub_d(_in0, _in7);                                \
    _out0 = _bf_s0_m;                                                             \
    _out1 = _bf_s1_m;                                                             \
    _out2 = _bf_s2_m;                                                             \
    _out3 = _bf_s3_m;                                                             \
    _out4 = _bf_d3_m;                                                             \
    _out5 = _bf_d2_m;                                                             \
    _out6 = _bf_d1_m;                                                             \
    _out7 = _bf_d0_m;                                                             \
}
1824 
1825 #endif //LASX
1826 
/*
 * =============================================================================
 * Description : Print out elements in vector.
 * Arguments   : Inputs  - RTYPE, element_num, in0, enter
 *               Outputs - none (text is written with printf)
 * Details     : Casts 'in0' to 'RTYPE' and prints its first 'element_num'
 *               elements, each followed by ','. If 'enter' is nonzero, the
 *               prefix "\nVP:" is printed first. Each element is converted to
 *               long long before printing, so 64-bit elements are printed in
 *               full. printf must be declared where the macro is used.
 * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
 *               VP:1,2,3,4,
 * =============================================================================
 */
#define VECT_PRINT(RTYPE, element_num, in0, enter)    \
{                                                     \
    /* Parenthesized so the cast covers all of in0 */ \
    RTYPE _tmp0 = (RTYPE)(in0);                       \
    int _i = 0;                                       \
    if (enter)                                        \
        printf("\nVP:");                              \
    /* Parenthesized so an expression such as      */ \
    /* 'c ? 2 : 4' is compared as a whole.         */ \
    for (_i = 0; _i < (element_num); _i++)            \
        printf("%lld,", (long long)_tmp0[_i]);        \
}
1847 
1848 #endif /* LOONGSON_INTRINSICS_H */
1849 
1850