• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Loongson Technology Corporation Limited
3  * All rights reserved.
4  * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
5  *                Xiwei Gu   <guxiwei-hf@loongson.cn>
6  *                Lu Wang    <wanglu@loongson.cn>
7  *
8  * This file is a header file for loongarch builtin extension.
9  *
10  */
11 
12 #ifndef LOONGSON_INTRINSICS_H
13 #define LOONGSON_INTRINSICS_H
14 
15 /**
16  * MAJOR version: Macro usage changes.
17  * MINOR version: Add new functions, or bug fixes.
18  * MICRO version: Comment changes or implementation changes.
19  */
20 #define LSOM_VERSION_MAJOR 1
21 #define LSOM_VERSION_MINOR 1
22 #define LSOM_VERSION_MICRO 0
23 
/*
 * Apply the one-argument operation _INS to two inputs:
 *   _OUT0 = _INS(_IN0);  _OUT1 = _INS(_IN1);
 * Expands to a single statement (do/while(0)), so it is safe as the body of
 * an unbraced if/else. Invoke it with a trailing semicolon.
 */
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
  do {                                            \
    _OUT0 = _INS(_IN0);                           \
    _OUT1 = _INS(_IN1);                           \
  } while (0)
29 
/*
 * Apply the two-argument operation _INS to two argument pairs:
 *   _OUT0 = _INS(_IN0, _IN1);  _OUT1 = _INS(_IN2, _IN3);
 * Expands to a single statement (do/while(0)), so it is safe as the body of
 * an unbraced if/else. Invoke it with a trailing semicolon.
 */
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
  do {                                                        \
    _OUT0 = _INS(_IN0, _IN1);                                 \
    _OUT1 = _INS(_IN2, _IN3);                                 \
  } while (0)
35 
/*
 * Apply the three-argument operation _INS to two argument triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);  _OUT1 = _INS(_IN3, _IN4, _IN5);
 * Expands to a single statement (do/while(0)), so it is safe as the body of
 * an unbraced if/else. Invoke it with a trailing semicolon.
 */
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
  do {                                                                    \
    _OUT0 = _INS(_IN0, _IN1, _IN2);                                       \
    _OUT1 = _INS(_IN3, _IN4, _IN5);                                       \
  } while (0)
41 
/*
 * Apply the one-argument operation _INS to four inputs, in order:
 *   _OUTn = _INS(_INn)  for n = 0..3
 * The four assignments are written out directly so the macro is
 * self-contained. Expands to a single statement (do/while(0)); invoke it
 * with a trailing semicolon.
 */
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
  do {                                                                      \
    _OUT0 = _INS(_IN0);                                                     \
    _OUT1 = _INS(_IN1);                                                     \
    _OUT2 = _INS(_IN2);                                                     \
    _OUT3 = _INS(_IN3);                                                     \
  } while (0)
47 
/*
 * Apply the two-argument operation _INS to four argument pairs, in order:
 *   _OUT0 = _INS(_IN0, _IN1);  _OUT1 = _INS(_IN2, _IN3);
 *   _OUT2 = _INS(_IN4, _IN5);  _OUT3 = _INS(_IN6, _IN7);
 * The four assignments are written out directly so the macro is
 * self-contained. Expands to a single statement (do/while(0)); invoke it
 * with a trailing semicolon.
 */
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
                  _OUT1, _OUT2, _OUT3)                                         \
  do {                                                                         \
    _OUT0 = _INS(_IN0, _IN1);                                                  \
    _OUT1 = _INS(_IN2, _IN3);                                                  \
    _OUT2 = _INS(_IN4, _IN5);                                                  \
    _OUT3 = _INS(_IN6, _IN7);                                                  \
  } while (0)
54 
/*
 * Apply the three-argument operation _INS to four argument triples, in order:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);   _OUT1 = _INS(_IN3, _IN4, _IN5);
 *   _OUT2 = _INS(_IN6, _IN7, _IN8);   _OUT3 = _INS(_IN9, _IN10, _IN11);
 * The four assignments are written out directly so the macro is
 * self-contained. Expands to a single statement (do/while(0)); invoke it
 * with a trailing semicolon.
 */
#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)             \
  do {                                                                        \
    _OUT0 = _INS(_IN0, _IN1, _IN2);                                           \
    _OUT1 = _INS(_IN3, _IN4, _IN5);                                           \
    _OUT2 = _INS(_IN6, _IN7, _IN8);                                           \
    _OUT3 = _INS(_IN9, _IN10, _IN11);                                         \
  } while (0)
61 
62 #ifdef __loongarch_sx
63 #include <lsxintrin.h>
64 /*
65  * =============================================================================
66  * Description : Dot product & addition of byte vector elements
67  * Arguments   : Inputs  - in_c, in_h, in_l
68  *               Outputs - out
69  *               Return Type - halfword
70  * Details     : Signed byte elements from in_h are multiplied by
71  *               signed byte elements from in_l, and then added adjacent to
72  *               each other to get results with the twice size of input.
73  *               Then the results plus to signed half-word elements from in_c.
74  * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
75  *        in_c : 1,2,3,4, 1,2,3,4
76  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
77  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
78  *         out : 23,40,41,26, 23,40,41,26
79  * =============================================================================
80  */
__lsx_vdp2add_h_b(__m128i in_c,__m128i in_h,__m128i in_l)81 static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
82                                         __m128i in_l) {
83   __m128i out;
84 
85   out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
86   out = __lsx_vmaddwod_h_b(out, in_h, in_l);
87   return out;
88 }
89 
90 /*
91  * =============================================================================
92  * Description : Dot product & addition of byte vector elements
93  * Arguments   : Inputs  - in_c, in_h, in_l
94  *               Outputs - out
95  *               Return Type - halfword
96  * Details     : Unsigned byte elements from in_h are multiplied by
97  *               unsigned byte elements from in_l, and then added adjacent to
98  *               each other to get results with the twice size of input.
99  *               The results plus to signed half-word elements from in_c.
100  * Example     : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
101  *        in_c : 1,2,3,4, 1,2,3,4
102  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
103  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
104  *         out : 23,40,41,26, 23,40,41,26
105  * =============================================================================
106  */
__lsx_vdp2add_h_bu(__m128i in_c,__m128i in_h,__m128i in_l)107 static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
108                                          __m128i in_l) {
109   __m128i out;
110 
111   out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
112   out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
113   return out;
114 }
115 
116 /*
117  * =============================================================================
118  * Description : Dot product & addition of byte vector elements
119  * Arguments   : Inputs  - in_c, in_h, in_l
120  *               Outputs - out
121  *               Return Type - halfword
122  * Details     : Unsigned byte elements from in_h are multiplied by
123  *               signed byte elements from in_l, and then added adjacent to
124  *               each other to get results with the twice size of input.
125  *               The results plus to signed half-word elements from in_c.
126  * Example     : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
127  *        in_c : 1,1,1,1, 1,1,1,1
128  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
129  *        in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
130  *         out : -4,-24,-60,-112, 6,26,62,114
131  * =============================================================================
132  */
__lsx_vdp2add_h_bu_b(__m128i in_c,__m128i in_h,__m128i in_l)133 static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
134                                            __m128i in_l) {
135   __m128i out;
136 
137   out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
138   out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
139   return out;
140 }
141 
142 /*
143  * =============================================================================
144  * Description : Dot product & addition of half-word vector elements
145  * Arguments   : Inputs  - in_c, in_h, in_l
146  *               Outputs - out
147  *               Return Type - __m128i
148  * Details     : Signed half-word elements from in_h are multiplied by
149  *               signed half-word elements from in_l, and then added adjacent to
150  *               each other to get results with the twice size of input.
151  *               Then the results plus to signed word elements from in_c.
152  * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
153  *        in_c : 1,2,3,4
154  *        in_h : 1,2,3,4, 5,6,7,8
155  *        in_l : 8,7,6,5, 4,3,2,1
156  *         out : 23,40,41,26
157  * =============================================================================
158  */
__lsx_vdp2add_w_h(__m128i in_c,__m128i in_h,__m128i in_l)159 static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
160                                         __m128i in_l) {
161   __m128i out;
162 
163   out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
164   out = __lsx_vmaddwod_w_h(out, in_h, in_l);
165   return out;
166 }
167 
168 /*
169  * =============================================================================
170  * Description : Dot product of byte vector elements
171  * Arguments   : Inputs  - in_h, in_l
172  *               Outputs - out
173  *               Return Type - halfword
174  * Details     : Signed byte elements from in_h are multiplied by
175  *               signed byte elements from in_l, and then added adjacent to
176  *               each other to get results with the twice size of input.
177  * Example     : out = __lsx_vdp2_h_b(in_h, in_l)
178  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
179  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
180  *         out : 22,38,38,22, 22,38,38,22
181  * =============================================================================
182  */
__lsx_vdp2_h_b(__m128i in_h,__m128i in_l)183 static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
184   __m128i out;
185 
186   out = __lsx_vmulwev_h_b(in_h, in_l);
187   out = __lsx_vmaddwod_h_b(out, in_h, in_l);
188   return out;
189 }
190 
191 /*
192  * =============================================================================
193  * Description : Dot product of byte vector elements
194  * Arguments   : Inputs  - in_h, in_l
195  *               Outputs - out
196  *               Return Type - halfword
197  * Details     : Unsigned byte elements from in_h are multiplied by
198  *               unsigned byte elements from in_l, and then added adjacent to
199  *               each other to get results with the twice size of input.
200  * Example     : out = __lsx_vdp2_h_bu(in_h, in_l)
201  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
202  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
203  *         out : 22,38,38,22, 22,38,38,22
204  * =============================================================================
205  */
__lsx_vdp2_h_bu(__m128i in_h,__m128i in_l)206 static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
207   __m128i out;
208 
209   out = __lsx_vmulwev_h_bu(in_h, in_l);
210   out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
211   return out;
212 }
213 
214 /*
215  * =============================================================================
216  * Description : Dot product of byte vector elements
217  * Arguments   : Inputs  - in_h, in_l
218  *               Outputs - out
219  *               Return Type - halfword
220  * Details     : Unsigned byte elements from in_h are multiplied by
221  *               signed byte elements from in_l, and then added adjacent to
222  *               each other to get results with the twice size of input.
223  * Example     : out = __lsx_vdp2_h_bu_b(in_h, in_l)
224  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
225  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
226  *         out : 22,38,38,22, 22,38,38,6
227  * =============================================================================
228  */
__lsx_vdp2_h_bu_b(__m128i in_h,__m128i in_l)229 static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
230   __m128i out;
231 
232   out = __lsx_vmulwev_h_bu_b(in_h, in_l);
233   out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
234   return out;
235 }
236 
237 /*
238  * =============================================================================
239  * Description : Dot product of byte vector elements
240  * Arguments   : Inputs  - in_h, in_l
241  *               Outputs - out
242  *               Return Type - halfword
243  * Details     : Signed byte elements from in_h are multiplied by
244  *               signed byte elements from in_l, and then added adjacent to
245  *               each other to get results with the twice size of input.
246  * Example     : out = __lsx_vdp2_w_h(in_h, in_l)
247  *        in_h : 1,2,3,4, 5,6,7,8
248  *        in_l : 8,7,6,5, 4,3,2,1
249  *         out : 22,38,38,22
250  * =============================================================================
251  */
__lsx_vdp2_w_h(__m128i in_h,__m128i in_l)252 static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
253   __m128i out;
254 
255   out = __lsx_vmulwev_w_h(in_h, in_l);
256   out = __lsx_vmaddwod_w_h(out, in_h, in_l);
257   return out;
258 }
259 
260 /*
261  * =============================================================================
262  * Description : Clip all halfword elements of input vector between min & max
263  *               out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
264  *               (_in))
265  * Arguments   : Inputs  - _in  (input vector)
266  *                       - min  (min threshold)
267  *                       - max  (max threshold)
268  *               Outputs - out  (output vector with clipped elements)
269  *               Return Type - signed halfword
270  * Example     : out = __lsx_vclip_h(_in)
271  *         _in : -8,2,280,249, -8,255,280,249
272  *         min : 1,1,1,1, 1,1,1,1
273  *         max : 9,9,9,9, 9,9,9,9
274  *         out : 1,2,9,9, 1,9,9,9
275  * =============================================================================
276  */
__lsx_vclip_h(__m128i _in,__m128i min,__m128i max)277 static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
278   __m128i out;
279 
280   out = __lsx_vmax_h(min, _in);
281   out = __lsx_vmin_h(max, out);
282   return out;
283 }
284 
285 /*
286  * =============================================================================
287  * Description : Set each element of vector between 0 and 255
288  * Arguments   : Inputs  - _in
289  *               Outputs - out
290  *               Return Type - halfword
291  * Details     : Signed byte elements from _in are clamped between 0 and 255.
292  * Example     : out = __lsx_vclip255_h(_in)
293  *         _in : -8,255,280,249, -8,255,280,249
294  *         out : 0,255,255,249, 0,255,255,249
295  * =============================================================================
296  */
__lsx_vclip255_h(__m128i _in)297 static inline __m128i __lsx_vclip255_h(__m128i _in) {
298   __m128i out;
299 
300   out = __lsx_vmaxi_h(_in, 0);
301   out = __lsx_vsat_hu(out, 7);
302   return out;
303 }
304 
305 /*
306  * =============================================================================
307  * Description : Set each element of vector between 0 and 255
308  * Arguments   : Inputs  - _in
309  *               Outputs - out
310  *               Return Type - word
311  * Details     : Signed byte elements from _in are clamped between 0 and 255.
312  * Example     : out = __lsx_vclip255_w(_in)
313  *         _in : -8,255,280,249
314  *         out : 0,255,255,249
315  * =============================================================================
316  */
__lsx_vclip255_w(__m128i _in)317 static inline __m128i __lsx_vclip255_w(__m128i _in) {
318   __m128i out;
319 
320   out = __lsx_vmaxi_w(_in, 0);
321   out = __lsx_vsat_wu(out, 7);
322   return out;
323 }
324 
/*
 * =============================================================================
 * Description : Swap two vector variables
 * Arguments   : Inputs  - _in0, _in1 (modifiable __m128i lvalues)
 *               Outputs - _in0, _in1 (in-place)
 * Details     : The values are exchanged through a temporary. Unlike an XOR
 *               swap, this leaves the value intact when both arguments name
 *               the same variable. Expands to a single statement; invoke it
 *               with a trailing semicolon.
 * Example     : LSX_SWAP(_in0, _in1)
 *        _in0 : 1,2,3,4
 *        _in1 : 5,6,7,8
 *   _in0(out) : 5,6,7,8
 *   _in1(out) : 1,2,3,4
 * =============================================================================
 */
#define LSX_SWAP(_in0, _in1)    \
  do {                          \
    __m128i _swap_tmp = (_in0); \
    (_in0) = (_in1);            \
    (_in1) = _swap_tmp;         \
  } while (0)
344 
/*
 * =============================================================================
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Rows are interleaved word-wise in pairs, then the pairs are
 *               interleaved doubleword-wise. All inputs are read into
 *               temporaries before any output is written. Expands to a single
 *               statement; invoke it with a trailing semicolon.
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 * =============================================================================
 */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  do {                                                                         \
    __m128i _t0, _t1, _t2, _t3;                                                \
                                                                               \
    _t0 = __lsx_vilvl_w(_in1, _in0);                                           \
    _t1 = __lsx_vilvh_w(_in1, _in0);                                           \
    _t2 = __lsx_vilvl_w(_in3, _in2);                                           \
    _t3 = __lsx_vilvh_w(_in3, _in2);                                           \
    _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
    _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
    _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
    _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
  } while (0)
371 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *               _out7
 * Details     : The rows of the matrix become columns, and the columns
 *               become rows. Only the low 8 bytes of each input are used.
 *               The odd outputs are built by moving the upper 8 bytes of the
 *               preceding even output down and zero-filling the top.
 *               NOTE(review): the even outputs (_out0/2/4/6) therefore appear
 *               to keep the next column in their upper 8 bytes rather than
 *               zeros as the example shows - confirm before relying on it.
 *               Expands to a single statement; invoke it with a trailing
 *               semicolon.
 * Example     : LSX_TRANSPOSE8x8_B
 *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 *       _out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
 *       _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
 *       _out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
 *       _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  do {                                                                      \
    /* Underscore-prefixed locals so that caller variables named `zero` */  \
    /* or `shuf8` are not shadowed inside the expansion.                */  \
    __m128i _zero = { 0 };                                                  \
    /* Byte selector: indices 8..15, then 16..23. */                        \
    __m128i _shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };            \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                         \
                                                                            \
    _t0 = __lsx_vilvl_b(_in2, _in0);                                        \
    _t1 = __lsx_vilvl_b(_in3, _in1);                                        \
    _t2 = __lsx_vilvl_b(_in6, _in4);                                        \
    _t3 = __lsx_vilvl_b(_in7, _in5);                                        \
    _t4 = __lsx_vilvl_b(_t1, _t0);                                          \
    _t5 = __lsx_vilvh_b(_t1, _t0);                                          \
    _t6 = __lsx_vilvl_b(_t3, _t2);                                          \
    _t7 = __lsx_vilvh_b(_t3, _t2);                                          \
    _out0 = __lsx_vilvl_w(_t6, _t4);                                        \
    _out2 = __lsx_vilvh_w(_t6, _t4);                                        \
    _out4 = __lsx_vilvl_w(_t7, _t5);                                        \
    _out6 = __lsx_vilvh_w(_t7, _t5);                                        \
    _out1 = __lsx_vshuf_b(_zero, _out0, _shuf8);                            \
    _out3 = __lsx_vshuf_b(_zero, _out2, _shuf8);                            \
    _out5 = __lsx_vshuf_b(_zero, _out4, _shuf8);                            \
    _out7 = __lsx_vshuf_b(_zero, _out6, _shuf8);                            \
  } while (0)
425 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *               _out7
 * Details     : Rows are interleaved halfword-wise in two stages into 4x4
 *               sub-blocks, which are then combined by picking even/odd
 *               doublewords. All inputs are read into temporaries before any
 *               output is written. Expands to a single statement; invoke it
 *               with a trailing semicolon.
 * Example     :
 *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
 *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
 *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
 *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
 *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
 *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
 *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
 *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  do {                                                                      \
    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;               \
                                                                            \
    /* Rows 4..7, low then high halves. */                                  \
    _s0 = __lsx_vilvl_h(_in6, _in4);                                        \
    _s1 = __lsx_vilvl_h(_in7, _in5);                                        \
    _t0 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t1 = __lsx_vilvh_h(_s1, _s0);                                          \
    _s0 = __lsx_vilvh_h(_in6, _in4);                                        \
    _s1 = __lsx_vilvh_h(_in7, _in5);                                        \
    _t2 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t3 = __lsx_vilvh_h(_s1, _s0);                                          \
    /* Rows 0..3, low then high halves. */                                  \
    _s0 = __lsx_vilvl_h(_in2, _in0);                                        \
    _s1 = __lsx_vilvl_h(_in3, _in1);                                        \
    _t4 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t5 = __lsx_vilvh_h(_s1, _s0);                                          \
    _s0 = __lsx_vilvh_h(_in2, _in0);                                        \
    _s1 = __lsx_vilvh_h(_in3, _in1);                                        \
    _t6 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t7 = __lsx_vilvh_h(_s1, _s0);                                          \
                                                                            \
    _out0 = __lsx_vpickev_d(_t0, _t4);                                      \
    _out2 = __lsx_vpickev_d(_t1, _t5);                                      \
    _out4 = __lsx_vpickev_d(_t2, _t6);                                      \
    _out6 = __lsx_vpickev_d(_t3, _t7);                                      \
    _out1 = __lsx_vpickod_d(_t0, _t4);                                      \
    _out3 = __lsx_vpickod_d(_t1, _t5);                                      \
    _out5 = __lsx_vpickod_d(_t2, _t6);                                      \
    _out7 = __lsx_vpickod_d(_t3, _t7);                                      \
  } while (0)
475 
/*
 * =============================================================================
 * Description : Transpose input 8x4 byte block into 4x8
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x4 byte block, 4 bytes used per row)
 *               Outputs - _out0, _out1, _out2, _out3  (output 4x8 byte block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Each transposed row is in the low 8 bytes of its output.
 *               NOTE(review): the upper 8 bytes of the outputs appear to hold
 *               leftover interleave data rather than the zeros shown in the
 *               example - confirm before relying on them.
 *               Expands to a single statement; invoke it with a trailing
 *               semicolon.
 * Example     : LSX_TRANSPOSE8x4_B
 *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3)                     \
  do {                                                                     \
    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                            \
                                                                           \
    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                 \
    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                 \
    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                 \
    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                 \
                                                                           \
    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                             \
    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                             \
                                                                           \
    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                               \
    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                               \
    /* _out1/_out3 take the upper doubleword of _out0/_out2. */            \
    _out1 = __lsx_vilvh_d(_out2, _out0);                                   \
    _out3 = __lsx_vilvh_d(_out0, _out2);                                   \
  } while (0)
520 
/*
 * =============================================================================
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *               _out7
 * Details     : The low 8 bytes of the 16 input rows are interleaved byte-,
 *               word- and doubleword-wise into 8 output rows of 16 bytes.
 *               Uses the DUP2_ARG2 / DUP4_ARG2 helpers defined in this header.
 *               Expands to a single statement; invoke it with a trailing
 *               semicolon.
 * Example     :
 *              000,001,002,003,004,005,006,007
 *              008,009,010,011,012,013,014,015
 *              016,017,018,019,020,021,022,023
 *              024,025,026,027,028,029,030,031
 *              032,033,034,035,036,037,038,039
 *              040,041,042,043,044,045,046,047        000,008,...,112,120
 *              048,049,050,051,052,053,054,055        001,009,...,113,121
 *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
 *              064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
 *              072,073,074,075,076,077,078,079        004,012,...,116,124
 *              080,081,082,083,084,085,086,087        005,013,...,117,125
 *              088,089,090,091,092,093,094,095        006,014,...,118,126
 *              096,097,098,099,100,101,102,103        007,015,...,119,127
 *              104,105,106,107,108,109,110,111
 *              112,113,114,115,116,117,118,119
 *              120,121,122,123,124,125,126,127
 * =============================================================================
 */
#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                            _out6, _out7)                                    \
  do {                                                                       \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
              _tmp0, _tmp1, _tmp2, _tmp3);                                   \
    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,  \
              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                            \
    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);          \
    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);          \
    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);          \
    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);          \
    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);              \
    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);              \
    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);              \
    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);              \
    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);      \
    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);      \
    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);      \
    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);      \
  } while (0)
571 
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation. The _B, _H, _W and _D variants use the
 *               byte, halfword, word and doubleword add/sub intrinsics.
 *               Outputs are written in order while inputs are still being
 *               read, so an output must not name an input that a later
 *               assignment reads. Each variant expands to a single statement;
 *               invoke it with a trailing semicolon.
 * Example     :
 *               out0 = in0 + in3;
 *               out1 = in1 + in2;
 *               out2 = in1 - in2;
 *               out3 = in0 - in3;
 * =============================================================================
 */
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  do {                                                                        \
    _out0 = __lsx_vadd_b(_in0, _in3);                                         \
    _out1 = __lsx_vadd_b(_in1, _in2);                                         \
    _out2 = __lsx_vsub_b(_in1, _in2);                                         \
    _out3 = __lsx_vsub_b(_in0, _in3);                                         \
  } while (0)
/*
 * 4-input butterfly using the halfword (_h) add/sub intrinsics.
 * All four results are computed into block-local vectors before any output
 * is written, so an output argument may name the same variable as an input
 * (in-place use) without a later difference reading an already-overwritten
 * operand.  Do not pass arguments named _bf_o0 .. _bf_o3.
 */
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    __m128i _bf_o0 = __lsx_vadd_h(_in0, _in3);                                \
    __m128i _bf_o1 = __lsx_vadd_h(_in1, _in2);                                \
    __m128i _bf_o2 = __lsx_vsub_h(_in1, _in2);                                \
    __m128i _bf_o3 = __lsx_vsub_h(_in0, _in3);                                \
    _out0 = _bf_o0;                                                           \
    _out1 = _bf_o1;                                                           \
    _out2 = _bf_o2;                                                           \
    _out3 = _bf_o3;                                                           \
  }
/*
 * 4-input butterfly using the word (_w) add/sub intrinsics.
 * All four results are computed into block-local vectors before any output
 * is written, so an output argument may name the same variable as an input
 * (in-place use) without a later difference reading an already-overwritten
 * operand.  Do not pass arguments named _bf_o0 .. _bf_o3.
 */
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    __m128i _bf_o0 = __lsx_vadd_w(_in0, _in3);                                \
    __m128i _bf_o1 = __lsx_vadd_w(_in1, _in2);                                \
    __m128i _bf_o2 = __lsx_vsub_w(_in1, _in2);                                \
    __m128i _bf_o3 = __lsx_vsub_w(_in0, _in3);                                \
    _out0 = _bf_o0;                                                           \
    _out1 = _bf_o1;                                                           \
    _out2 = _bf_o2;                                                           \
    _out3 = _bf_o3;                                                           \
  }
/*
 * 4-input butterfly using the doubleword (_d) add/sub intrinsics.
 * All four results are computed into block-local vectors before any output
 * is written, so an output argument may name the same variable as an input
 * (in-place use) without a later difference reading an already-overwritten
 * operand.  Do not pass arguments named _bf_o0 .. _bf_o3.
 */
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    __m128i _bf_o0 = __lsx_vadd_d(_in0, _in3);                                \
    __m128i _bf_o1 = __lsx_vadd_d(_in1, _in2);                                \
    __m128i _bf_o2 = __lsx_vsub_d(_in1, _in2);                                \
    __m128i _bf_o3 = __lsx_vsub_d(_in0, _in3);                                \
    _out0 = _bf_o0;                                                           \
    _out1 = _bf_o1;                                                           \
    _out2 = _bf_o2;                                                           \
    _out3 = _bf_o3;                                                           \
  }
613 
614 /*
615  * =============================================================================
616  * Description : Butterfly of 8 input vectors
617  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
618  *               Outputs - _out0, _out1, _out2, _out3, ~
619  * Details     : Butterfly operation
620  * Example     :
621  *              _out0 = _in0 + _in7;
622  *              _out1 = _in1 + _in6;
623  *              _out2 = _in2 + _in5;
624  *              _out3 = _in3 + _in4;
625  *              _out4 = _in3 - _in4;
626  *              _out5 = _in2 - _in5;
627  *              _out6 = _in1 - _in6;
628  *              _out7 = _in0 - _in7;
629  * =============================================================================
630  */
/*
 * 8-input butterfly using the byte (_b) add/sub intrinsics.
 * All eight results are computed into block-local vectors before any output
 * is written, so an output argument may name the same variable as an input
 * (in-place use) without a later difference reading an already-overwritten
 * operand.  Do not pass arguments named _bf_o0 .. _bf_o7.
 */
#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    __m128i _bf_o0 = __lsx_vadd_b(_in0, _in7);                             \
    __m128i _bf_o1 = __lsx_vadd_b(_in1, _in6);                             \
    __m128i _bf_o2 = __lsx_vadd_b(_in2, _in5);                             \
    __m128i _bf_o3 = __lsx_vadd_b(_in3, _in4);                             \
    __m128i _bf_o4 = __lsx_vsub_b(_in3, _in4);                             \
    __m128i _bf_o5 = __lsx_vsub_b(_in2, _in5);                             \
    __m128i _bf_o6 = __lsx_vsub_b(_in1, _in6);                             \
    __m128i _bf_o7 = __lsx_vsub_b(_in0, _in7);                             \
    _out0 = _bf_o0;                                                        \
    _out1 = _bf_o1;                                                        \
    _out2 = _bf_o2;                                                        \
    _out3 = _bf_o3;                                                        \
    _out4 = _bf_o4;                                                        \
    _out5 = _bf_o5;                                                        \
    _out6 = _bf_o6;                                                        \
    _out7 = _bf_o7;                                                        \
  }
644 
/*
 * 8-input butterfly using the halfword (_h) add/sub intrinsics.
 * All eight results are computed into block-local vectors before any output
 * is written, so an output argument may name the same variable as an input
 * (in-place use) without a later difference reading an already-overwritten
 * operand.  Do not pass arguments named _bf_o0 .. _bf_o7.
 */
#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    __m128i _bf_o0 = __lsx_vadd_h(_in0, _in7);                             \
    __m128i _bf_o1 = __lsx_vadd_h(_in1, _in6);                             \
    __m128i _bf_o2 = __lsx_vadd_h(_in2, _in5);                             \
    __m128i _bf_o3 = __lsx_vadd_h(_in3, _in4);                             \
    __m128i _bf_o4 = __lsx_vsub_h(_in3, _in4);                             \
    __m128i _bf_o5 = __lsx_vsub_h(_in2, _in5);                             \
    __m128i _bf_o6 = __lsx_vsub_h(_in1, _in6);                             \
    __m128i _bf_o7 = __lsx_vsub_h(_in0, _in7);                             \
    _out0 = _bf_o0;                                                        \
    _out1 = _bf_o1;                                                        \
    _out2 = _bf_o2;                                                        \
    _out3 = _bf_o3;                                                        \
    _out4 = _bf_o4;                                                        \
    _out5 = _bf_o5;                                                        \
    _out6 = _bf_o6;                                                        \
    _out7 = _bf_o7;                                                        \
  }
658 
/*
 * 8-input butterfly using the word (_w) add/sub intrinsics.
 * All eight results are computed into block-local vectors before any output
 * is written, so an output argument may name the same variable as an input
 * (in-place use) without a later difference reading an already-overwritten
 * operand.  Do not pass arguments named _bf_o0 .. _bf_o7.
 */
#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    __m128i _bf_o0 = __lsx_vadd_w(_in0, _in7);                             \
    __m128i _bf_o1 = __lsx_vadd_w(_in1, _in6);                             \
    __m128i _bf_o2 = __lsx_vadd_w(_in2, _in5);                             \
    __m128i _bf_o3 = __lsx_vadd_w(_in3, _in4);                             \
    __m128i _bf_o4 = __lsx_vsub_w(_in3, _in4);                             \
    __m128i _bf_o5 = __lsx_vsub_w(_in2, _in5);                             \
    __m128i _bf_o6 = __lsx_vsub_w(_in1, _in6);                             \
    __m128i _bf_o7 = __lsx_vsub_w(_in0, _in7);                             \
    _out0 = _bf_o0;                                                        \
    _out1 = _bf_o1;                                                        \
    _out2 = _bf_o2;                                                        \
    _out3 = _bf_o3;                                                        \
    _out4 = _bf_o4;                                                        \
    _out5 = _bf_o5;                                                        \
    _out6 = _bf_o6;                                                        \
    _out7 = _bf_o7;                                                        \
  }
672 
/*
 * 8-input butterfly using the doubleword (_d) add/sub intrinsics.
 * All eight results are computed into block-local vectors before any output
 * is written, so an output argument may name the same variable as an input
 * (in-place use) without a later difference reading an already-overwritten
 * operand.  Do not pass arguments named _bf_o0 .. _bf_o7.
 */
#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    __m128i _bf_o0 = __lsx_vadd_d(_in0, _in7);                             \
    __m128i _bf_o1 = __lsx_vadd_d(_in1, _in6);                             \
    __m128i _bf_o2 = __lsx_vadd_d(_in2, _in5);                             \
    __m128i _bf_o3 = __lsx_vadd_d(_in3, _in4);                             \
    __m128i _bf_o4 = __lsx_vsub_d(_in3, _in4);                             \
    __m128i _bf_o5 = __lsx_vsub_d(_in2, _in5);                             \
    __m128i _bf_o6 = __lsx_vsub_d(_in1, _in6);                             \
    __m128i _bf_o7 = __lsx_vsub_d(_in0, _in7);                             \
    _out0 = _bf_o0;                                                        \
    _out1 = _bf_o1;                                                        \
    _out2 = _bf_o2;                                                        \
    _out3 = _bf_o3;                                                        \
    _out4 = _bf_o4;                                                        \
    _out5 = _bf_o5;                                                        \
    _out6 = _bf_o6;                                                        \
    _out7 = _bf_o7;                                                        \
  }
686 
687 #endif  // LSX
688 
689 #ifdef __loongarch_asx
690 #include <lasxintrin.h>
691 /*
692  * =============================================================================
693  * Description : Dot product of byte vector elements
694  * Arguments   : Inputs - in_h, in_l
695  *               Output - out
696  *               Return Type - signed halfword
697  * Details     : Unsigned byte elements from in_h are multiplied with
698  *               unsigned byte elements from in_l producing a result
699  *               twice the size of input i.e. signed halfword.
700  *               Then this multiplied results of adjacent odd-even elements
701  *               are added to the out vector
702  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
703  * =============================================================================
704  */
__lasx_xvdp2_h_bu(__m256i in_h,__m256i in_l)705 static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
706   __m256i out;
707 
708   out = __lasx_xvmulwev_h_bu(in_h, in_l);
709   out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
710   return out;
711 }
712 
713 /*
714  * =============================================================================
715  * Description : Dot product of byte vector elements
716  * Arguments   : Inputs - in_h, in_l
717  *               Output - out
718  *               Return Type - signed halfword
719  * Details     : Signed byte elements from in_h are multiplied with
720  *               signed byte elements from in_l producing a result
721  *               twice the size of input i.e. signed halfword.
722  *               Then this multiplication results of adjacent odd-even elements
723  *               are added to the out vector
724  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
725  * =============================================================================
726  */
__lasx_xvdp2_h_b(__m256i in_h,__m256i in_l)727 static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
728   __m256i out;
729 
730   out = __lasx_xvmulwev_h_b(in_h, in_l);
731   out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
732   return out;
733 }
734 
735 /*
736  * =============================================================================
737  * Description : Dot product of halfword vector elements
738  * Arguments   : Inputs - in_h, in_l
739  *               Output - out
740  *               Return Type - signed word
741  * Details     : Signed halfword elements from in_h are multiplied with
742  *               signed halfword elements from in_l producing a result
743  *               twice the size of input i.e. signed word.
744  *               Then this multiplied results of adjacent odd-even elements
745  *               are added to the out vector.
746  * Example     : out = __lasx_xvdp2_w_h(in_h, in_l)
747  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
748  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
749  *         out : 22,38,38,22, 22,38,38,22
750  * =============================================================================
751  */
__lasx_xvdp2_w_h(__m256i in_h,__m256i in_l)752 static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
753   __m256i out;
754 
755   out = __lasx_xvmulwev_w_h(in_h, in_l);
756   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
757   return out;
758 }
759 
760 /*
761  * =============================================================================
762  * Description : Dot product of word vector elements
763  * Arguments   : Inputs - in_h, in_l
764  *               Output - out
765  *               Return Type - signed double
766  * Details     : Signed word elements from in_h are multiplied with
767  *               signed word elements from in_l producing a result
768  *               twice the size of input i.e. signed double-word.
769  *               Then this multiplied results of adjacent odd-even elements
770  *               are added to the out vector.
771  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
772  * =============================================================================
773  */
__lasx_xvdp2_d_w(__m256i in_h,__m256i in_l)774 static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
775   __m256i out;
776 
777   out = __lasx_xvmulwev_d_w(in_h, in_l);
778   out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
779   return out;
780 }
781 
782 /*
783  * =============================================================================
784  * Description : Dot product of halfword vector elements
785  * Arguments   : Inputs - in_h, in_l
786  *               Output - out
787  *               Return Type - signed word
788  * Details     : Unsigned halfword elements from in_h are multiplied with
789  *               signed halfword elements from in_l producing a result
790  *               twice the size of input i.e. unsigned word.
791  *               Multiplication result of adjacent odd-even elements
792  *               are added to the out vector
793  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
794  * =============================================================================
795  */
__lasx_xvdp2_w_hu_h(__m256i in_h,__m256i in_l)796 static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
797   __m256i out;
798 
799   out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
800   out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
801   return out;
802 }
803 
804 /*
805  * =============================================================================
806  * Description : Dot product & addition of byte vector elements
807  * Arguments   : Inputs - in_h, in_l
808  *               Output - out
809  *               Return Type - halfword
810  * Details     : Signed byte elements from in_h are multiplied with
811  *               signed byte elements from in_l producing a result
812  *               twice the size of input i.e. signed halfword.
813  *               Then this multiplied results of adjacent odd-even elements
814  *               are added to the in_c vector.
815  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
816  * =============================================================================
817  */
__lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h,__m256i in_l)818 static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
819                                           __m256i in_l) {
820   __m256i out;
821 
822   out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
823   out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
824   return out;
825 }
826 
827 /*
828  * =============================================================================
829  * Description : Dot product & addition of byte vector elements
830  * Arguments   : Inputs - in_h, in_l
831  *               Output - out
832  *               Return Type - halfword
833  * Details     : Unsigned byte elements from in_h are multiplied with
834  *               unsigned byte elements from in_l producing a result
835  *               twice the size of input i.e. signed halfword.
836  *               Then this multiplied results of adjacent odd-even elements
837  *               are added to the in_c vector.
838  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
839  * =============================================================================
840  */
__lasx_xvdp2add_h_bu(__m256i in_c,__m256i in_h,__m256i in_l)841 static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
842                                            __m256i in_l) {
843   __m256i out;
844 
845   out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
846   out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
847   return out;
848 }
849 
850 /*
851  * =============================================================================
852  * Description : Dot product & addition of byte vector elements
853  * Arguments   : Inputs - in_h, in_l
854  *               Output - out
855  *               Return Type - halfword
856  * Details     : Unsigned byte elements from in_h are multiplied with
857  *               signed byte elements from in_l producing a result
858  *               twice the size of input i.e. signed halfword.
859  *               Then this multiplied results of adjacent odd-even elements
860  *               are added to the in_c vector.
861  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
862  * =============================================================================
863  */
__lasx_xvdp2add_h_bu_b(__m256i in_c,__m256i in_h,__m256i in_l)864 static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
865                                              __m256i in_l) {
866   __m256i out;
867 
868   out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
869   out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
870   return out;
871 }
872 
873 /*
874  * =============================================================================
875  * Description : Dot product of halfword vector elements
876  * Arguments   : Inputs - in_c, in_h, in_l
877  *               Output - out
878  *               Return Type - per RTYPE
879  * Details     : Signed halfword elements from in_h are multiplied with
880  *               signed halfword elements from in_l producing a result
881  *               twice the size of input i.e. signed word.
882  *               Multiplication result of adjacent odd-even elements
883  *               are added to the in_c vector.
884  * Example     : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
885  *        in_c : 1,2,3,4, 1,2,3,4
886  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
887  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
888  *         out : 23,40,41,26, 23,40,41,26
889  * =============================================================================
890  */
__lasx_xvdp2add_w_h(__m256i in_c,__m256i in_h,__m256i in_l)891 static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
892                                           __m256i in_l) {
893   __m256i out;
894 
895   out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
896   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
897   return out;
898 }
899 
900 /*
901  * =============================================================================
902  * Description : Dot product of halfword vector elements
903  * Arguments   : Inputs - in_c, in_h, in_l
904  *               Output - out
905  *               Return Type - signed word
906  * Details     : Unsigned halfword elements from in_h are multiplied with
907  *               unsigned halfword elements from in_l producing a result
908  *               twice the size of input i.e. signed word.
909  *               Multiplication result of adjacent odd-even elements
910  *               are added to the in_c vector.
911  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
912  * =============================================================================
913  */
__lasx_xvdp2add_w_hu(__m256i in_c,__m256i in_h,__m256i in_l)914 static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
915                                            __m256i in_l) {
916   __m256i out;
917 
918   out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
919   out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
920   return out;
921 }
922 
923 /*
924  * =============================================================================
925  * Description : Dot product of halfword vector elements
926  * Arguments   : Inputs - in_c, in_h, in_l
927  *               Output - out
928  *               Return Type - signed word
929  * Details     : Unsigned halfword elements from in_h are multiplied with
930  *               signed halfword elements from in_l producing a result
931  *               twice the size of input i.e. signed word.
932  *               Multiplication result of adjacent odd-even elements
933  *               are added to the in_c vector
934  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
935  * =============================================================================
936  */
__lasx_xvdp2add_w_hu_h(__m256i in_c,__m256i in_h,__m256i in_l)937 static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
938                                              __m256i in_l) {
939   __m256i out;
940 
941   out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
942   out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
943   return out;
944 }
945 
946 /*
947  * =============================================================================
948  * Description : Vector Unsigned Dot Product and Subtract
949  * Arguments   : Inputs - in_c, in_h, in_l
950  *               Output - out
951  *               Return Type - signed halfword
952  * Details     : Unsigned byte elements from in_h are multiplied with
953  *               unsigned byte elements from in_l producing a result
954  *               twice the size of input i.e. signed halfword.
955  *               Multiplication result of adjacent odd-even elements
956  *               are added together and subtracted from double width elements
957  *               in_c vector.
958  * Example     : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
959  * =============================================================================
960  */
__lasx_xvdp2sub_h_bu(__m256i in_c,__m256i in_h,__m256i in_l)961 static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
962                                            __m256i in_l) {
963   __m256i out;
964 
965   out = __lasx_xvmulwev_h_bu(in_h, in_l);
966   out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
967   out = __lasx_xvsub_h(in_c, out);
968   return out;
969 }
970 
971 /*
972  * =============================================================================
973  * Description : Vector Signed Dot Product and Subtract
974  * Arguments   : Inputs - in_c, in_h, in_l
975  *               Output - out
976  *               Return Type - signed word
977  * Details     : Signed halfword elements from in_h are multiplied with
978  *               Signed halfword elements from in_l producing a result
979  *               twice the size of input i.e. signed word.
980  *               Multiplication result of adjacent odd-even elements
981  *               are added together and subtracted from double width elements
982  *               in_c vector.
983  * Example     : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
984  *        in_c : 0,0,0,0, 0,0,0,0
985  *        in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
986  *        in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
987  *         out : -7,-3,0,0, 0,-1,0,-1
988  * =============================================================================
989  */
__lasx_xvdp2sub_w_h(__m256i in_c,__m256i in_h,__m256i in_l)990 static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
991                                           __m256i in_l) {
992   __m256i out;
993 
994   out = __lasx_xvmulwev_w_h(in_h, in_l);
995   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
996   out = __lasx_xvsub_w(in_c, out);
997   return out;
998 }
999 
1000 /*
1001  * =============================================================================
1002  * Description : Dot product of halfword vector elements
1003  * Arguments   : Inputs - in_h, in_l
1004  *               Output - out
1005  *               Return Type - signed word
1006  * Details     : Signed halfword elements from in_h are multiplied with
1007  *               signed halfword elements from in_l producing a result
1008  *               four times the size of input i.e. signed doubleword.
1009  *               Then this multiplication results of four adjacent elements
1010  *               are added together and stored to the out vector.
1011  * Example     : out = __lasx_xvdp4_d_h(in_h, in_l)
1012  *        in_h :  3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
1013  *        in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
1014  *         out : -2,0,1,1
1015  * =============================================================================
1016  */
__lasx_xvdp4_d_h(__m256i in_h,__m256i in_l)1017 static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
1018   __m256i out;
1019 
1020   out = __lasx_xvmulwev_w_h(in_h, in_l);
1021   out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1022   out = __lasx_xvhaddw_d_w(out, out);
1023   return out;
1024 }
1025 
1026 /*
1027  * =============================================================================
1028  * Description : The high half of the vector elements are expanded and
1029  *               added after being doubled.
1030  * Arguments   : Inputs - in_h, in_l
1031  *               Output - out
1032  * Details     : The in_h vector and the in_l vector are added after the
1033  *               higher half of the two-fold sign extension (signed byte
1034  *               to signed halfword) and stored to the out vector.
1035  * Example     : See out = __lasx_xvaddwh_w_h(in_h, in_l)
1036  * =============================================================================
1037  */
__lasx_xvaddwh_h_b(__m256i in_h,__m256i in_l)1038 static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
1039   __m256i out;
1040 
1041   out = __lasx_xvilvh_b(in_h, in_l);
1042   out = __lasx_xvhaddw_h_b(out, out);
1043   return out;
1044 }
1045 
1046 /*
1047  * =============================================================================
1048  * Description : The high half of the vector elements are expanded and
1049  *               added after being doubled.
1050  * Arguments   : Inputs - in_h, in_l
1051  *               Output - out
1052  * Details     : The in_h vector and the in_l vector are added after the
1053  *               higher half of the two-fold sign extension (signed halfword
1054  *               to signed word) and stored to the out vector.
1055  * Example     : out = __lasx_xvaddwh_w_h(in_h, in_l)
1056  *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1057  *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1058  *         out : 1,0,0,-1, 1,0,0, 2
1059  * =============================================================================
1060  */
__lasx_xvaddwh_w_h(__m256i in_h,__m256i in_l)1061 static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
1062   __m256i out;
1063 
1064   out = __lasx_xvilvh_h(in_h, in_l);
1065   out = __lasx_xvhaddw_w_h(out, out);
1066   return out;
1067 }
1068 
1069 /*
1070  * =============================================================================
1071  * Description : The low half of the vector elements are expanded and
1072  *               added after being doubled.
1073  * Arguments   : Inputs - in_h, in_l
1074  *               Output - out
1075  * Details     : The in_h vector and the in_l vector are added after the
1076  *               lower half of the two-fold sign extension (signed byte
1077  *               to signed halfword) and stored to the out vector.
1078  * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1079  * =============================================================================
1080  */
__lasx_xvaddwl_h_b(__m256i in_h,__m256i in_l)1081 static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
1082   __m256i out;
1083 
1084   out = __lasx_xvilvl_b(in_h, in_l);
1085   out = __lasx_xvhaddw_h_b(out, out);
1086   return out;
1087 }
1088 
1089 /*
1090  * =============================================================================
1091  * Description : The low half of the vector elements are expanded and
1092  *               added after being doubled.
1093  * Arguments   : Inputs - in_h, in_l
1094  *               Output - out
1095  * Details     : The in_h vector and the in_l vector are added after the
1096  *               lower half of the two-fold sign extension (signed halfword
1097  *               to signed word) and stored to the out vector.
1098  * Example     : out = __lasx_xvaddwl_w_h(in_h, in_l)
1099  *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1100  *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1101  *         out : 5,-1,4,2, 1,0,2,-1
1102  * =============================================================================
1103  */
__lasx_xvaddwl_w_h(__m256i in_h,__m256i in_l)1104 static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
1105   __m256i out;
1106 
1107   out = __lasx_xvilvl_h(in_h, in_l);
1108   out = __lasx_xvhaddw_w_h(out, out);
1109   return out;
1110 }
1111 
1112 /*
1113  * =============================================================================
1114  * Description : The low half of the vector elements are expanded and
1115  *               added after being doubled.
1116  * Arguments   : Inputs - in_h, in_l
1117  *               Output - out
1118  * Details     : The out vector and the out vector are added after the
1119  *               lower half of the two-fold zero extension (unsigned byte
1120  *               to unsigned halfword) and stored to the out vector.
1121  * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1122  * =============================================================================
1123  */
__lasx_xvaddwl_h_bu(__m256i in_h,__m256i in_l)1124 static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
1125   __m256i out;
1126 
1127   out = __lasx_xvilvl_b(in_h, in_l);
1128   out = __lasx_xvhaddw_hu_bu(out, out);
1129   return out;
1130 }
1131 
1132 /*
1133  * =============================================================================
1134  * Description : The low half of the vector elements are expanded and
1135  *               added after being doubled.
1136  * Arguments   : Inputs - in_h, in_l
1137  *               Output - out
1138  * Details     : The in_l vector after double zero extension (unsigned byte to
1139  *               signed halfword),added to the in_h vector.
1140  * Example     : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
1141  * =============================================================================
1142  */
__lasx_xvaddw_h_h_bu(__m256i in_h,__m256i in_l)1143 static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
1144   __m256i out;
1145 
1146   out = __lasx_xvsllwil_hu_bu(in_l, 0);
1147   out = __lasx_xvadd_h(in_h, out);
1148   return out;
1149 }
1150 
1151 /*
1152  * =============================================================================
1153  * Description : The low half of the vector elements are expanded and
1154  *               added after being doubled.
1155  * Arguments   : Inputs - in_h, in_l
1156  *               Output - out
1157  * Details     : The in_l vector after double sign extension (signed halfword to
1158  *               signed word), added to the in_h vector.
1159  * Example     : out = __lasx_xvaddw_w_w_h(in_h, in_l)
1160  *        in_h : 0, 1,0,0, -1,0,0,1,
1161  *        in_l : 2,-1,1,2,  1,0,0,0, 0,0,1,0, 1,0,0,1,
1162  *         out : 2, 0,1,2, -1,0,1,1,
1163  * =============================================================================
1164  */
__lasx_xvaddw_w_w_h(__m256i in_h,__m256i in_l)1165 static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
1166   __m256i out;
1167 
1168   out = __lasx_xvsllwil_w_h(in_l, 0);
1169   out = __lasx_xvadd_w(in_h, out);
1170   return out;
1171 }
1172 
1173 /*
1174  * =============================================================================
1175  * Description : Multiplication and addition calculation after expansion
1176  *               of the lower half of the vector.
1177  * Arguments   : Inputs - in_c, in_h, in_l
1178  *               Output - out
1179  * Details     : The in_h vector and the in_l vector are multiplied after
1180  *               the lower half of the two-fold sign extension (signed halfword
1181  *               to signed word), and the result is added to the vector in_c,
1182  *               then stored to the out vector.
1183  * Example     : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1184  *        in_c : 1,2,3,4, 5,6,7,8
1185  *        in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
1186  *        in_l : 200, 300, 400, 500,  2000, 3000, 4000, 5000,
1187  *              -200,-300,-400,-500, -2000,-3000,-4000,-5000
1188  *         out : 201, 602,1203,2004, -995, -1794,-2793,-3992
1189  * =============================================================================
1190  */
__lasx_xvmaddwl_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1191 static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
1192                                           __m256i in_l) {
1193   __m256i tmp0, tmp1, out;
1194 
1195   tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1196   tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1197   tmp0 = __lasx_xvmul_w(tmp0, tmp1);
1198   out = __lasx_xvadd_w(tmp0, in_c);
1199   return out;
1200 }
1201 
1202 /*
1203  * =============================================================================
1204  * Description : Multiplication and addition calculation after expansion
1205  *               of the higher half of the vector.
1206  * Arguments   : Inputs - in_c, in_h, in_l
1207  *               Output - out
1208  * Details     : The in_h vector and the in_l vector are multiplied after
1209  *               the higher half of the two-fold sign extension (signed
1210  *               halfword to signed word), and the result is added to
1211  *               the vector in_c, then stored to the out vector.
1212  * Example     : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1213  * =============================================================================
1214  */
static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  /* Interleave the higher halfwords of each operand with themselves, then
   * let the widening even-element multiply produce the word products. */
  const __m256i dup_h = __lasx_xvilvh_h(in_h, in_h);
  const __m256i dup_l = __lasx_xvilvh_h(in_l, in_l);

  /* Accumulate the word products onto in_c. */
  return __lasx_xvadd_w(__lasx_xvmulwev_w_h(dup_h, dup_l), in_c);
}
1225 
1226 /*
1227  * =============================================================================
1228  * Description : Multiplication calculation after expansion of the lower
1229  *               half of the vector.
1230  * Arguments   : Inputs - in_h, in_l
1231  *               Output - out
1232  * Details     : The in_h vector and the in_l vector are multiplied after
1233  *               the lower half of the two-fold sign extension (signed
1234  *               halfword to signed word), then stored to the out vector.
1235  * Example     : out = __lasx_xvmulwl_w_h(in_h, in_l)
1236  *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1237  *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1238  *         out : 6,1,3,0, 0,0,1,0
1239  * =============================================================================
1240  */
static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
  /* Sign-extend the lower halfwords of both operands to words (widening
   * shift by 0), then multiply word-wise. */
  const __m256i wide_h = __lasx_xvsllwil_w_h(in_h, 0);
  const __m256i wide_l = __lasx_xvsllwil_w_h(in_l, 0);

  return __lasx_xvmul_w(wide_h, wide_l);
}
1249 
1250 /*
1251  * =============================================================================
1252  * Description : Multiplication calculation after expansion of the higher
1253  *               half of the vector.
1254  * Arguments   : Inputs - in_h, in_l
1255  *               Output - out
1256  * Details     : The in_h vector and the in_l vector are multiplied after
1257  *               the higher half of the two-fold sign extension (signed
1258  *               halfword to signed word), then stored to the out vector.
1259  * Example     : out = __lasx_xvmulwh_w_h(in_h, in_l)
1260  *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1261  *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1262  *         out : 0,0,0,0, 0,0,0,1
1263  * =============================================================================
1264  */
static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
  /* Interleave the higher halfwords of each operand with themselves; the
   * widening even-element multiply then yields the word products. */
  const __m256i dup_h = __lasx_xvilvh_h(in_h, in_h);
  const __m256i dup_l = __lasx_xvilvh_h(in_l, in_l);

  return __lasx_xvmulwev_w_h(dup_h, dup_l);
}
1273 
1274 /*
1275  * =============================================================================
1276  * Description : The low half of the byte elements is widened to halfwords
1277  *               and added to the halfword vector with unsigned saturation.
1278  * Arguments   : Inputs - in_h, in_l
1279  *               Output - out
1280  * Details     : The lower half of each 128-bit lane of in_l is zero-extended
1281  *               (unsigned byte to unsigned halfword) and added to in_h with
1282  *               unsigned saturation. The results are stored to the out
1283  *               vector.
1284  * Example     : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
1285  *        in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
1286  *        in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
1287  *               0,0,0,1
1288  *         out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2
1289  * =============================================================================
1290  */
static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
  const __m256i zero = { 0 };
  /* Interleaving with zero bytes widens the lower bytes of in_l to
   * unsigned halfwords. */
  const __m256i widened = __lasx_xvilvl_b(zero, in_l);

  /* Unsigned saturating halfword add. */
  return __lasx_xvsadd_hu(in_h, widened);
}
1299 
1300 /*
1301  * =============================================================================
1302  * Description : Clip all halfword elements of input vector between min & max
1303  *               out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
1304  * Arguments   : Inputs  - in    (input vector)
1305  *                       - min   (min threshold)
1306  *                       - max   (max threshold)
1307  *               Outputs - out   (output vector with clipped elements)
1308  *               Return Type - signed halfword
1309  * Example     : out = __lasx_xvclip_h(in, min, max)
1310  *          in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
1311  *         min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
1312  *         max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
1313  *         out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
1314  * =============================================================================
1315  */
static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
  /* Raise every element to at least min ... */
  const __m256i floored = __lasx_xvmax_h(min, in);

  /* ... then cap it at max. */
  return __lasx_xvmin_h(max, floored);
}
1323 
1324 /*
1325  * =============================================================================
1326  * Description : Clip all signed halfword elements of input vector
1327  *               between 0 & 255
1328  * Arguments   : Inputs  - in   (input vector)
1329  *               Outputs - out  (output vector with clipped elements)
1330  *               Return Type - signed halfword
1331  * Example     : See out = __lasx_xvclip255_w(in)
1332  * =============================================================================
1333  */
static inline __m256i __lasx_xvclip255_h(__m256i in) {
  /* Negative halfwords become 0. */
  const __m256i non_negative = __lasx_xvmaxi_h(in, 0);

  /* Unsigned saturation with immediate 7 caps each element at 255. */
  return __lasx_xvsat_hu(non_negative, 7);
}
1341 
1342 /*
1343  * =============================================================================
1344  * Description : Clip all signed word elements of input vector
1345  *               between 0 & 255
1346  * Arguments   : Inputs - in   (input vector)
1347  *               Output - out  (output vector with clipped elements)
1348  *               Return Type - signed word
1349  * Example     : out = __lasx_xvclip255_w(in)
1350  *          in : -8,255,280,249, -8,255,280,249
1351  *         out :  0,255,255,249,  0,255,255,249
1352  * =============================================================================
1353  */
static inline __m256i __lasx_xvclip255_w(__m256i in) {
  /* Negative words become 0. */
  const __m256i non_negative = __lasx_xvmaxi_w(in, 0);

  /* Unsigned saturation with immediate 7 caps each element at 255. */
  return __lasx_xvsat_wu(non_negative, 7);
}
1361 
1362 /*
1363  * =============================================================================
1364  * Description : Indexed halfword element values are replicated to all
1365  *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
1366  *               if 'idx >= 8' use xvsplati_h_*.
1367  * Arguments   : Inputs - in, idx
1368  *               Output - out
1369  * Details     : Idx element value from in vector is replicated to all
1370  *               elements in out vector.
1371  *               Valid index range for halfword operation is 0-7
1372  * Example     : out = __lasx_xvsplati_l_h(in, idx)
1373  *          in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
1374  *         idx : 0x02
1375  *         out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
1376  * =============================================================================
1377  */
static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
  /* Rearrange the 128-bit halves of in with selector 0x02 (presumably the
   * low half ends up in both halves), so the replicate below reads the
   * requested halfword from the low half. */
  const __m256i halves = __lasx_xvpermi_q(in, in, 0x02);

  return __lasx_xvreplve_h(halves, idx);
}
1385 
1386 /*
1387  * =============================================================================
1388  * Description : Indexed halfword element values are replicated to all
1389  *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
1390  *               if 'idx >= 8' use xvsplati_h_*.
1391  * Arguments   : Inputs - in, idx
1392  *               Output - out
1393  * Details     : Idx element value from in vector is replicated to all
1394  *               elements in out vector.
1395  *               Index range for this variant is 8-15 (see the example)
1396  * Example     : out = __lasx_xvsplati_h_h(in, idx)
1397  *          in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
1398  *         idx : 0x09
1399  *         out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1400  * =============================================================================
1401  */
static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
  /* Rearrange the 128-bit halves of in with selector 0x13 (presumably the
   * high half ends up in both halves), so the replicate below reads the
   * requested halfword from the high half. */
  const __m256i halves = __lasx_xvpermi_q(in, in, 0x13);

  return __lasx_xvreplve_h(halves, idx);
}
1409 
1410 /*
1411  * =============================================================================
1412  * Description : Transpose 4x4 block with double-word elements in vectors
1413  * Arguments   : Inputs  - _in0, _in1, _in2, _in3
1414  *               Outputs - _out0, _out1, _out2, _out3
1415  * Example     : LASX_TRANSPOSE4x4_D
1416  *        _in0 : 1,2,3,4
1417  *        _in1 : 1,2,3,4
1418  *        _in2 : 1,2,3,4
1419  *        _in3 : 1,2,3,4
1420  *
1421  *       _out0 : 1,1,1,1
1422  *       _out1 : 2,2,2,2
1423  *       _out2 : 3,3,3,3
1424  *       _out3 : 4,4,4,4
1425  * =============================================================================
1426  */
/*
 * Implementation notes for LASX_TRANSPOSE4x4_D:
 *  - xvilvl_d / xvilvh_d interleave the doublewords of (_in0, _in1) and of
 *    (_in2, _in3); xvpermi_q with selectors 0x20 / 0x31 then combines the
 *    128-bit halves of those intermediates into the four outputs
 *    (presumably the low halves for 0x20 and the high halves for 0x31).
 *  - Every input is read before the first output is assigned, so an output
 *    argument may be the same variable as an input argument.
 *  - Each input argument is expanded twice; pass side-effect-free
 *    expressions.
 *  - The expansion is a bare { } block that declares _tmp0.._tmp3, so
 *    arguments must not use those names.
 */
1427 #define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
1428                             _out3)                                       \
1429   {                                                                      \
1430     __m256i _tmp0, _tmp1, _tmp2, _tmp3;                                  \
1431     _tmp0 = __lasx_xvilvl_d(_in1, _in0);                                 \
1432     _tmp1 = __lasx_xvilvh_d(_in1, _in0);                                 \
1433     _tmp2 = __lasx_xvilvl_d(_in3, _in2);                                 \
1434     _tmp3 = __lasx_xvilvh_d(_in3, _in2);                                 \
1435     _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                        \
1436     _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                        \
1437     _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                        \
1438     _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                        \
1439   }
1440 
1441 /*
1442  * =============================================================================
1443  * Description : Transpose 8x8 block with word elements in vectors
1444  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
1445  *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
1446  *               _out7
1447  * Example     : LASX_TRANSPOSE8x8_W
1448  *        _in0 : 1,2,3,4,5,6,7,8
1449  *        _in1 : 2,2,3,4,5,6,7,8
1450  *        _in2 : 3,2,3,4,5,6,7,8
1451  *        _in3 : 4,2,3,4,5,6,7,8
1452  *        _in4 : 5,2,3,4,5,6,7,8
1453  *        _in5 : 6,2,3,4,5,6,7,8
1454  *        _in6 : 7,2,3,4,5,6,7,8
1455  *        _in7 : 8,2,3,4,5,6,7,8
1456  *
1457  *       _out0 : 1,2,3,4,5,6,7,8
1458  *       _out1 : 2,2,2,2,2,2,2,2
1459  *       _out2 : 3,3,3,3,3,3,3,3
1460  *       _out3 : 4,4,4,4,4,4,4,4
1461  *       _out4 : 5,5,5,5,5,5,5,5
1462  *       _out5 : 6,6,6,6,6,6,6,6
1463  *       _out6 : 7,7,7,7,7,7,7,7
1464  *       _out7 : 8,8,8,8,8,8,8,8
1465  * =============================================================================
1466  */
/*
 * Implementation notes for LASX_TRANSPOSE8x8_W:
 *  - Rows 0-3 and rows 4-7 are processed as two groups. Within a group two
 *    rounds of word interleaves (xvilvl_w / xvilvh_w) are applied, giving
 *    _tmp0_m.._tmp3_m for rows 0-3 and _tmp4_m.._tmp7_m for rows 4-7.
 *  - xvpermi_q then pairs _tmpN_m with _tmp(N+4)_m: selector 0x20 produces
 *    _out0.._out3 and selector 0x31 produces _out4.._out7 (presumably from
 *    the low and the high 128-bit halves respectively).
 *  - Every input is read before the first output is assigned, so outputs
 *    may name the same variables as inputs.
 *  - Each input argument is expanded twice; pass side-effect-free
 *    expressions.
 *  - The expansion is a bare { } block that declares _s0_m, _s1_m and
 *    _tmp0_m.._tmp7_m, so arguments must not use those names.
 */
1467 #define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
1468                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
1469                             _out7)                                           \
1470   {                                                                          \
1471     __m256i _s0_m, _s1_m;                                                    \
1472     __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
1473     __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
1474                                                                              \
1475     _s0_m = __lasx_xvilvl_w(_in2, _in0);                                     \
1476     _s1_m = __lasx_xvilvl_w(_in3, _in1);                                     \
1477     _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
1478     _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
1479     _s0_m = __lasx_xvilvh_w(_in2, _in0);                                     \
1480     _s1_m = __lasx_xvilvh_w(_in3, _in1);                                     \
1481     _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
1482     _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
1483     _s0_m = __lasx_xvilvl_w(_in6, _in4);                                     \
1484     _s1_m = __lasx_xvilvl_w(_in7, _in5);                                     \
1485     _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
1486     _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
1487     _s0_m = __lasx_xvilvh_w(_in6, _in4);                                     \
1488     _s1_m = __lasx_xvilvh_w(_in7, _in5);                                     \
1489     _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
1490     _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
1491     _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                        \
1492     _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                        \
1493     _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                        \
1494     _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                        \
1495     _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                        \
1496     _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                        \
1497     _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                        \
1498     _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                        \
1499   }
1500 
1501 /*
1502  * =============================================================================
1503  * Description : Transpose input 16x8 byte block
1504  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
1505  *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
1506  *                         (input 16x8 byte block)
1507  *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
1508  *                         _out7 (output 8x16 byte block)
1509  * Details     : The rows of the matrix become columns, and the columns become
1510  *               rows.
1511  * Example     : See LASX_TRANSPOSE16x8_H
1512  * =============================================================================
1513  */
/*
 * Implementation notes for LASX_TRANSPOSE16x8_B:
 *  - The inputs are only ever passed to xvilvl_b, once each, pairing rows
 *    (0,2), (1,3), (4,6), (5,7), (8,10), (9,11), (12,14) and (13,15).
 *  - A second round of byte interleaves (xvilvl_b / xvilvh_b) writes
 *    intermediate results into _out0.._out7, which serve as scratch here.
 *  - Word interleaves (xvilvl_w / xvilvh_w) move the data back into
 *    _tmp0_m.._tmp7_m, and doubleword interleaves (xvilvl_d / xvilvh_d)
 *    produce the final _out0.._out7.
 *  - Every input is read before the first output is assigned, so outputs
 *    may name the same variables as inputs.
 *  - The expansion is a bare { } block that declares _tmp0_m.._tmp7_m, so
 *    arguments must not use those names.
 */
1514 #define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
1515                              _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
1516                              _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
1517                              _out6, _out7)                                    \
1518   {                                                                           \
1519     __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
1520     __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
1521                                                                               \
1522     _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                    \
1523     _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                    \
1524     _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                    \
1525     _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                    \
1526     _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                                   \
1527     _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                                   \
1528     _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                                  \
1529     _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                                  \
1530     _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                                \
1531     _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                                \
1532     _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                                \
1533     _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                                \
1534     _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                                \
1535     _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                                \
1536     _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                                \
1537     _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                                \
1538     _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                                  \
1539     _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                                  \
1540     _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                                  \
1541     _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                                  \
1542     _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                                  \
1543     _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                                  \
1544     _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                                  \
1545     _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                                  \
1546     _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                                \
1547     _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                                \
1548     _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                                \
1549     _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                                \
1550     _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                                \
1551     _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                                \
1552     _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                                \
1553     _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                                \
1554   }
1555 
1556 /*
1557  * =============================================================================
1558  * Description : Transpose input 16x8 halfword block
1559  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
1560  *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
1561  *                         (input 16x8 halfword block)
1562  *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
1563  *                         _out7 (output 8x16 halfword block)
1564  * Details     : The rows of the matrix become columns, and the columns become
1565  *               rows.
1566  * Example     : LASX_TRANSPOSE16x8_H
1567  *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1568  *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1569  *        _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1570  *        _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1571  *        _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1572  *        _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1573  *        _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1574  *        _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1575  *        _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1576  *        _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1577  *       _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1578  *       _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1579  *       _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1580  *       _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1581  *       _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1582  *       _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1583  *
1584  *       _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
1585  *       _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
1586  *       _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
1587  *       _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
1588  *       _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
1589  *       _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
1590  *       _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
1591  *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
1592  * =============================================================================
1593  */
/*
 * Implementation notes for LASX_TRANSPOSE16x8_H:
 *  - The macro runs the same sequence twice. The first pass starts from
 *    xvilvl_h of the inputs and produces _out0.._out3; the second pass
 *    starts from xvilvh_h of the inputs and produces _out4.._out7.
 *  - Each pass is: halfword interleave of row pairs, a second halfword
 *    interleave into _t0.._t7, doubleword interleaves back into
 *    _tmp0_m.._tmp7_m, then xvpermi_q with selector 0x20 to merge the
 *    results for rows 0-7 with the results for rows 8-15.
 *  - CAUTION: _out0.._out3 are assigned before the inputs are read again
 *    by the second pass, so _out0.._out3 must not name the same variables
 *    as any input.
 *  - Each input argument is expanded twice; pass side-effect-free
 *    expressions.
 *  - The expansion is a bare { } block that declares _tmp0_m.._tmp7_m and
 *    _t0.._t7, so arguments must not use those names.
 */
1594 #define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
1595                              _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
1596                              _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
1597                              _out6, _out7)                                    \
1598   {                                                                           \
1599     __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
1600     __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
1601     __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                           \
1602                                                                               \
1603     _tmp0_m = __lasx_xvilvl_h(_in2, _in0);                                    \
1604     _tmp1_m = __lasx_xvilvl_h(_in3, _in1);                                    \
1605     _tmp2_m = __lasx_xvilvl_h(_in6, _in4);                                    \
1606     _tmp3_m = __lasx_xvilvl_h(_in7, _in5);                                    \
1607     _tmp4_m = __lasx_xvilvl_h(_in10, _in8);                                   \
1608     _tmp5_m = __lasx_xvilvl_h(_in11, _in9);                                   \
1609     _tmp6_m = __lasx_xvilvl_h(_in14, _in12);                                  \
1610     _tmp7_m = __lasx_xvilvl_h(_in15, _in13);                                  \
1611     _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
1612     _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
1613     _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
1614     _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
1615     _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
1616     _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
1617     _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
1618     _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
1619     _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
1620     _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
1621     _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
1622     _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
1623     _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
1624     _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
1625     _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
1626     _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
1627     _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
1628     _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
1629     _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
1630     _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
1631                                                                               \
1632     _tmp0_m = __lasx_xvilvh_h(_in2, _in0);                                    \
1633     _tmp1_m = __lasx_xvilvh_h(_in3, _in1);                                    \
1634     _tmp2_m = __lasx_xvilvh_h(_in6, _in4);                                    \
1635     _tmp3_m = __lasx_xvilvh_h(_in7, _in5);                                    \
1636     _tmp4_m = __lasx_xvilvh_h(_in10, _in8);                                   \
1637     _tmp5_m = __lasx_xvilvh_h(_in11, _in9);                                   \
1638     _tmp6_m = __lasx_xvilvh_h(_in14, _in12);                                  \
1639     _tmp7_m = __lasx_xvilvh_h(_in15, _in13);                                  \
1640     _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
1641     _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
1642     _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
1643     _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
1644     _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
1645     _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
1646     _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
1647     _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
1648     _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
1649     _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
1650     _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
1651     _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
1652     _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
1653     _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
1654     _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
1655     _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
1656     _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
1657     _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
1658     _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
1659     _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
1660   }
1661 
1662 /*
1663  * =============================================================================
1664  * Description : Transpose 4x4 block with halfword elements in vectors
1665  * Arguments   : Inputs  - _in0, _in1, _in2, _in3
1666  *               Outputs - _out0, _out1, _out2, _out3
1667  *               Return Type - signed halfword
1668  * Details     : The rows of the matrix become columns, and the columns become
1669  *               rows.
1670  * Example     : See LASX_TRANSPOSE8x8_H
1671  * =============================================================================
1672  */
/*
 * Implementation notes for LASX_TRANSPOSE4x4_H:
 *  - xvilvl_h interleaves the halfwords of (_in0, _in1) and of
 *    (_in2, _in3); xvilvl_w / xvilvh_w of those two results give _out0 and
 *    _out2.
 *  - _out1 and _out3 are derived from the already-computed _out0 and _out2
 *    by xvilvh_d of each vector with itself, so _out0 / _out2 must be
 *    distinct variables from _out1 / _out3.
 *  - Every input is read before the first output is assigned, so outputs
 *    may name the same variables as inputs.
 *  - The expansion is a bare { } block that declares _s0_m and _s1_m, so
 *    arguments must not use those names.
 */
1673 #define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
1674                             _out3)                                       \
1675   {                                                                      \
1676     __m256i _s0_m, _s1_m;                                                \
1677                                                                          \
1678     _s0_m = __lasx_xvilvl_h(_in1, _in0);                                 \
1679     _s1_m = __lasx_xvilvl_h(_in3, _in2);                                 \
1680     _out0 = __lasx_xvilvl_w(_s1_m, _s0_m);                               \
1681     _out2 = __lasx_xvilvh_w(_s1_m, _s0_m);                               \
1682     _out1 = __lasx_xvilvh_d(_out0, _out0);                               \
1683     _out3 = __lasx_xvilvh_d(_out2, _out2);                               \
1684   }
1685 
1686 /*
1687  * =============================================================================
1688  * Description : Transpose input 8x8 byte block
1689  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
1690  *                         (input 8x8 byte block)
1691  *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
1692  *                         _out7 (output 8x8 byte block)
1693  * Example     : See LASX_TRANSPOSE8x8_H
1694  * =============================================================================
1695  */
/*
 * Implementation notes for LASX_TRANSPOSE8x8_B:
 *  - The inputs are only ever passed to xvilvl_b, once each, pairing rows
 *    (0,2), (1,3), (4,6) and (5,7).
 *  - A second round of byte interleaves (xvilvl_b / xvilvh_b) fills
 *    _tmp4_m.._tmp7_m; word interleaves (xvilvl_w / xvilvh_w) of those give
 *    the even outputs _out0, _out2, _out4 and _out6.
 *  - Each odd output is the preceding even output passed through
 *    xvbsrl_v with a byte count of 8, so an even output and the odd output
 *    derived from it must be distinct variables.
 *  - Every input is read before the first output is assigned, so outputs
 *    may name the same variables as inputs.
 *  - The expansion is a bare { } block that declares _tmp0_m.._tmp7_m, so
 *    arguments must not use those names.
 */
1696 #define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
1697                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
1698                             _out7)                                           \
1699   {                                                                          \
1700     __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
1701     __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
1702     _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                   \
1703     _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                   \
1704     _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                   \
1705     _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                   \
1706     _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                             \
1707     _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                             \
1708     _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                             \
1709     _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                             \
1710     _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m);                               \
1711     _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m);                               \
1712     _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m);                               \
1713     _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m);                               \
1714     _out1 = __lasx_xvbsrl_v(_out0, 8);                                       \
1715     _out3 = __lasx_xvbsrl_v(_out2, 8);                                       \
1716     _out5 = __lasx_xvbsrl_v(_out4, 8);                                       \
1717     _out7 = __lasx_xvbsrl_v(_out6, 8);                                       \
1718   }
1719 
/*
 * =============================================================================
 * Description : Transpose 8x8 blocks of halfword elements held in vectors.
 * Arguments   : Inputs  - _in0 ~ _in7   (one matrix row per vector)
 *               Outputs - _out0 ~ _out7 (one matrix column per vector)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. A 256-bit vector carries two 128-bit lanes of eight
 *               halfwords each; the 8x8 matrix in the low lanes and the 8x8
 *               matrix in the high lanes are transposed independently.
 *               Every input is read before the first output is written, so
 *               an output may name the same variable as an input.
 *               Each input expression is expanded twice; pass plain
 *               variables, not expressions with side effects.
 * Example     : LASX_TRANSPOSE8x8_H
 *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *
 *       _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
 *       _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
 *       _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
 *       _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
 *       _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
 *       _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
 *       _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
 *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    __m256i _even_m, _odd_m;                                                 \
    __m256i _lo01_m, _lo23_m, _lo45_m, _lo67_m;                              \
    __m256i _hi01_m, _hi23_m, _hi45_m, _hi67_m;                              \
                                                                             \
    /* Rows 0..3 -> column pairs (0,1), (2,3), (4,5), (6,7). */              \
    _even_m = __lasx_xvilvl_h(_in2, _in0);                                   \
    _odd_m = __lasx_xvilvl_h(_in3, _in1);                                    \
    _lo01_m = __lasx_xvilvl_h(_odd_m, _even_m);                              \
    _lo23_m = __lasx_xvilvh_h(_odd_m, _even_m);                              \
    _even_m = __lasx_xvilvh_h(_in2, _in0);                                   \
    _odd_m = __lasx_xvilvh_h(_in3, _in1);                                    \
    _lo45_m = __lasx_xvilvl_h(_odd_m, _even_m);                              \
    _lo67_m = __lasx_xvilvh_h(_odd_m, _even_m);                              \
                                                                             \
    /* Rows 4..7 -> column pairs (0,1), (2,3), (4,5), (6,7). */              \
    _even_m = __lasx_xvilvl_h(_in6, _in4);                                   \
    _odd_m = __lasx_xvilvl_h(_in7, _in5);                                    \
    _hi01_m = __lasx_xvilvl_h(_odd_m, _even_m);                              \
    _hi23_m = __lasx_xvilvh_h(_odd_m, _even_m);                              \
    _even_m = __lasx_xvilvh_h(_in6, _in4);                                   \
    _odd_m = __lasx_xvilvh_h(_in7, _in5);                                    \
    _hi45_m = __lasx_xvilvl_h(_odd_m, _even_m);                              \
    _hi67_m = __lasx_xvilvh_h(_odd_m, _even_m);                              \
                                                                             \
    /* Even doublewords: first column of each pair; odd: the second. */      \
    _out0 = __lasx_xvpickev_d(_hi01_m, _lo01_m);                             \
    _out1 = __lasx_xvpickod_d(_hi01_m, _lo01_m);                             \
    _out2 = __lasx_xvpickev_d(_hi23_m, _lo23_m);                             \
    _out3 = __lasx_xvpickod_d(_hi23_m, _lo23_m);                             \
    _out4 = __lasx_xvpickev_d(_hi45_m, _lo45_m);                             \
    _out5 = __lasx_xvpickod_d(_hi45_m, _lo45_m);                             \
    _out6 = __lasx_xvpickev_d(_hi67_m, _lo67_m);                             \
    _out7 = __lasx_xvpickod_d(_hi67_m, _lo67_m);                             \
  }
1782 
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation. The _B/_H/_W/_D suffix selects the
 *               xvadd/xvsub intrinsic variant (byte, halfword, word or
 *               doubleword elements).
 * Example     : LASX_BUTTERFLY_4
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 * =============================================================================
 */
/* Byte variant. Outputs may name the same variables as inputs. */
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    /* Compute everything before storing so outputs may alias inputs. */       \
    __m256i _sum03_m = __lasx_xvadd_b(_in0, _in3);                             \
    __m256i _sum12_m = __lasx_xvadd_b(_in1, _in2);                             \
    __m256i _dif12_m = __lasx_xvsub_b(_in1, _in2);                             \
    __m256i _dif03_m = __lasx_xvsub_b(_in0, _in3);                             \
    _out0 = _sum03_m;                                                          \
    _out1 = _sum12_m;                                                          \
    _out2 = _dif12_m;                                                          \
    _out3 = _dif03_m;                                                          \
  }
/*
 * 4-input butterfly using the halfword add/sub intrinsics:
 *   _out0 = _in0 + _in3, _out1 = _in1 + _in2,
 *   _out2 = _in1 - _in2, _out3 = _in0 - _in3.
 * Outputs may name the same variables as inputs.
 */
#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    /* Compute everything before storing so outputs may alias inputs. */       \
    __m256i _sum03_m = __lasx_xvadd_h(_in0, _in3);                             \
    __m256i _sum12_m = __lasx_xvadd_h(_in1, _in2);                             \
    __m256i _dif12_m = __lasx_xvsub_h(_in1, _in2);                             \
    __m256i _dif03_m = __lasx_xvsub_h(_in0, _in3);                             \
    _out0 = _sum03_m;                                                          \
    _out1 = _sum12_m;                                                          \
    _out2 = _dif12_m;                                                          \
    _out3 = _dif03_m;                                                          \
  }
/*
 * 4-input butterfly using the word add/sub intrinsics:
 *   _out0 = _in0 + _in3, _out1 = _in1 + _in2,
 *   _out2 = _in1 - _in2, _out3 = _in0 - _in3.
 * Outputs may name the same variables as inputs.
 */
#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    /* Compute everything before storing so outputs may alias inputs. */       \
    __m256i _sum03_m = __lasx_xvadd_w(_in0, _in3);                             \
    __m256i _sum12_m = __lasx_xvadd_w(_in1, _in2);                             \
    __m256i _dif12_m = __lasx_xvsub_w(_in1, _in2);                             \
    __m256i _dif03_m = __lasx_xvsub_w(_in0, _in3);                             \
    _out0 = _sum03_m;                                                          \
    _out1 = _sum12_m;                                                          \
    _out2 = _dif12_m;                                                          \
    _out3 = _dif03_m;                                                          \
  }
/*
 * 4-input butterfly using the doubleword add/sub intrinsics:
 *   _out0 = _in0 + _in3, _out1 = _in1 + _in2,
 *   _out2 = _in1 - _in2, _out3 = _in0 - _in3.
 * Outputs may name the same variables as inputs.
 */
#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                            \
    /* Compute everything before storing so outputs may alias inputs. */       \
    __m256i _sum03_m = __lasx_xvadd_d(_in0, _in3);                             \
    __m256i _sum12_m = __lasx_xvadd_d(_in1, _in2);                             \
    __m256i _dif12_m = __lasx_xvsub_d(_in1, _in2);                             \
    __m256i _dif03_m = __lasx_xvsub_d(_in0, _in3);                             \
    _out0 = _sum03_m;                                                          \
    _out1 = _sum12_m;                                                          \
    _out2 = _dif12_m;                                                          \
    _out3 = _dif03_m;                                                          \
  }
1824 
/*
 * =============================================================================
 * Description : Butterfly of 8 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
 *               Outputs - _out0, _out1, _out2, _out3, ~
 * Details     : Butterfly operation. The _B/_H/_W/_D suffix selects the
 *               xvadd/xvsub intrinsic variant (byte, halfword, word or
 *               doubleword elements).
 * Example     : LASX_BUTTERFLY_8
 *               _out0 = _in0 + _in7;
 *               _out1 = _in1 + _in6;
 *               _out2 = _in2 + _in5;
 *               _out3 = _in3 + _in4;
 *               _out4 = _in3 - _in4;
 *               _out5 = _in2 - _in5;
 *               _out6 = _in1 - _in6;
 *               _out7 = _in0 - _in7;
 * =============================================================================
 */
/* Byte variant. Outputs may name the same variables as inputs. */
#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    /* Compute everything before storing so outputs may alias inputs. */    \
    __m256i _sum07_m = __lasx_xvadd_b(_in0, _in7);                          \
    __m256i _sum16_m = __lasx_xvadd_b(_in1, _in6);                          \
    __m256i _sum25_m = __lasx_xvadd_b(_in2, _in5);                          \
    __m256i _sum34_m = __lasx_xvadd_b(_in3, _in4);                          \
    __m256i _dif34_m = __lasx_xvsub_b(_in3, _in4);                          \
    __m256i _dif25_m = __lasx_xvsub_b(_in2, _in5);                          \
    __m256i _dif16_m = __lasx_xvsub_b(_in1, _in6);                          \
    __m256i _dif07_m = __lasx_xvsub_b(_in0, _in7);                          \
    _out0 = _sum07_m;                                                       \
    _out1 = _sum16_m;                                                       \
    _out2 = _sum25_m;                                                       \
    _out3 = _sum34_m;                                                       \
    _out4 = _dif34_m;                                                       \
    _out5 = _dif25_m;                                                       \
    _out6 = _dif16_m;                                                       \
    _out7 = _dif07_m;                                                       \
  }
1855 
/*
 * 8-input butterfly using the halfword add/sub intrinsics:
 *   _out0.._out3 = _in0 + _in7, _in1 + _in6, _in2 + _in5, _in3 + _in4
 *   _out4.._out7 = _in3 - _in4, _in2 - _in5, _in1 - _in6, _in0 - _in7
 * Outputs may name the same variables as inputs.
 */
#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    /* Compute everything before storing so outputs may alias inputs. */    \
    __m256i _sum07_m = __lasx_xvadd_h(_in0, _in7);                          \
    __m256i _sum16_m = __lasx_xvadd_h(_in1, _in6);                          \
    __m256i _sum25_m = __lasx_xvadd_h(_in2, _in5);                          \
    __m256i _sum34_m = __lasx_xvadd_h(_in3, _in4);                          \
    __m256i _dif34_m = __lasx_xvsub_h(_in3, _in4);                          \
    __m256i _dif25_m = __lasx_xvsub_h(_in2, _in5);                          \
    __m256i _dif16_m = __lasx_xvsub_h(_in1, _in6);                          \
    __m256i _dif07_m = __lasx_xvsub_h(_in0, _in7);                          \
    _out0 = _sum07_m;                                                       \
    _out1 = _sum16_m;                                                       \
    _out2 = _sum25_m;                                                       \
    _out3 = _sum34_m;                                                       \
    _out4 = _dif34_m;                                                       \
    _out5 = _dif25_m;                                                       \
    _out6 = _dif16_m;                                                       \
    _out7 = _dif07_m;                                                       \
  }
1869 
/*
 * 8-input butterfly using the word add/sub intrinsics:
 *   _out0.._out3 = _in0 + _in7, _in1 + _in6, _in2 + _in5, _in3 + _in4
 *   _out4.._out7 = _in3 - _in4, _in2 - _in5, _in1 - _in6, _in0 - _in7
 * Outputs may name the same variables as inputs.
 */
#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    /* Compute everything before storing so outputs may alias inputs. */    \
    __m256i _sum07_m = __lasx_xvadd_w(_in0, _in7);                          \
    __m256i _sum16_m = __lasx_xvadd_w(_in1, _in6);                          \
    __m256i _sum25_m = __lasx_xvadd_w(_in2, _in5);                          \
    __m256i _sum34_m = __lasx_xvadd_w(_in3, _in4);                          \
    __m256i _dif34_m = __lasx_xvsub_w(_in3, _in4);                          \
    __m256i _dif25_m = __lasx_xvsub_w(_in2, _in5);                          \
    __m256i _dif16_m = __lasx_xvsub_w(_in1, _in6);                          \
    __m256i _dif07_m = __lasx_xvsub_w(_in0, _in7);                          \
    _out0 = _sum07_m;                                                       \
    _out1 = _sum16_m;                                                       \
    _out2 = _sum25_m;                                                       \
    _out3 = _sum34_m;                                                       \
    _out4 = _dif34_m;                                                       \
    _out5 = _dif25_m;                                                       \
    _out6 = _dif16_m;                                                       \
    _out7 = _dif07_m;                                                       \
  }
1883 
/*
 * 8-input butterfly using the doubleword add/sub intrinsics:
 *   _out0.._out3 = _in0 + _in7, _in1 + _in6, _in2 + _in5, _in3 + _in4
 *   _out4.._out7 = _in3 - _in4, _in2 - _in5, _in1 - _in6, _in0 - _in7
 * Outputs may name the same variables as inputs.
 */
#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  {                                                                         \
    /* Compute everything before storing so outputs may alias inputs. */    \
    __m256i _sum07_m = __lasx_xvadd_d(_in0, _in7);                          \
    __m256i _sum16_m = __lasx_xvadd_d(_in1, _in6);                          \
    __m256i _sum25_m = __lasx_xvadd_d(_in2, _in5);                          \
    __m256i _sum34_m = __lasx_xvadd_d(_in3, _in4);                          \
    __m256i _dif34_m = __lasx_xvsub_d(_in3, _in4);                          \
    __m256i _dif25_m = __lasx_xvsub_d(_in2, _in5);                          \
    __m256i _dif16_m = __lasx_xvsub_d(_in1, _in6);                          \
    __m256i _dif07_m = __lasx_xvsub_d(_in0, _in7);                          \
    _out0 = _sum07_m;                                                       \
    _out1 = _sum16_m;                                                       \
    _out2 = _sum25_m;                                                       \
    _out3 = _sum34_m;                                                       \
    _out4 = _dif34_m;                                                       \
    _out5 = _dif25_m;                                                       \
    _out6 = _dif16_m;                                                       \
    _out7 = _dif07_m;                                                       \
  }
1897 
1898 #endif  // LASX
1899 
/*
 * =============================================================================
 * Description : Print out elements in vector.
 * Arguments   : Inputs  - RTYPE, element_num, in0, enter
 *               Outputs -
 * Details     : Print the first 'element_num' elements of 'in0', viewed as
 *               an 'RTYPE' vector, to stdout, each followed by ','. If
 *               'enter' is non-zero, the prefix "\nVP:" is printed first.
 *               Each element is converted to int before it is handed to the
 *               "%d" conversion, so wider elements are truncated.
 *               'in0', 'enter' and 'element_num' are each evaluated exactly
 *               once. printf must be declared (<stdio.h>) where the macro
 *               is expanded.
 * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
 *               VP:1,2,3,4,
 * =============================================================================
 */
#define VECT_PRINT(RTYPE, element_num, in0, enter)                 \
  {                                                                \
    RTYPE _vp_vec_m = (RTYPE)(in0);                                \
    int _vp_cnt_m, _vp_idx_m;                                      \
    if (enter) {                                                   \
      printf("\nVP:");                                             \
    }                                                              \
    /* Read the count once so the argument is not re-evaluated. */ \
    _vp_cnt_m = (int)(element_num);                                \
    for (_vp_idx_m = 0; _vp_idx_m < _vp_cnt_m; _vp_idx_m++) {      \
      printf("%d,", (int)_vp_vec_m[_vp_idx_m]);                    \
    }                                                              \
  }
1918 
1919 #endif /* LOONGSON_INTRINSICS_H */
1920