• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  *  Copyright 2022 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H
12 #define INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H
13 
14 /*
15  * Copyright (c) 2022 Loongson Technology Corporation Limited
16  * All rights reserved.
17  * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
18  *                Xiwei Gu   <guxiwei-hf@loongson.cn>
19  *                Lu Wang    <wanglu@loongson.cn>
20  *
21  * This file is a header file for loongarch builtin extension.
22  *
23  */
24 
25 #ifndef LOONGSON_INTRINSICS_H
26 #define LOONGSON_INTRINSICS_H
27 
28 /**
29  * MAJOR version: Macro usage changes.
30  * MINOR version: Add new functions, or bug fixes.
31  * MICRO version: Comment changes or implementation changes.
32  */
33 #define LSOM_VERSION_MAJOR 1
34 #define LSOM_VERSION_MINOR 1
35 #define LSOM_VERSION_MICRO 0
36 
/*
 * Apply the one-argument operation _INS to two independent inputs:
 *   _OUT0 = _INS(_IN0);  _OUT1 = _INS(_IN1);
 * _INS may be a function or a function-like macro; _OUT0 and _OUT1 must be
 * assignable lvalues.
 * The body is wrapped in do { } while (0) so that the invocation together
 * with its trailing semicolon forms exactly one statement, which keeps it
 * safe inside an unbraced if/else.
 */
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
  do {                                            \
    _OUT0 = _INS(_IN0);                           \
    _OUT1 = _INS(_IN1);                           \
  } while (0)
42 
/*
 * Apply the two-argument operation _INS to two independent input pairs:
 *   _OUT0 = _INS(_IN0, _IN1);  _OUT1 = _INS(_IN2, _IN3);
 * _INS may be a function or a function-like macro; _OUT0 and _OUT1 must be
 * assignable lvalues.
 * Wrapped in do { } while (0) so the invocation plus its trailing semicolon
 * is a single statement (safe inside an unbraced if/else).
 */
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
  do {                                                        \
    _OUT0 = _INS(_IN0, _IN1);                                 \
    _OUT1 = _INS(_IN2, _IN3);                                 \
  } while (0)
48 
/*
 * Apply the three-argument operation _INS to two independent input triples:
 *   _OUT0 = _INS(_IN0, _IN1, _IN2);  _OUT1 = _INS(_IN3, _IN4, _IN5);
 * _INS may be a function or a function-like macro; _OUT0 and _OUT1 must be
 * assignable lvalues.
 * Wrapped in do { } while (0) so the invocation plus its trailing semicolon
 * is a single statement (safe inside an unbraced if/else).
 */
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
  do {                                                                    \
    _OUT0 = _INS(_IN0, _IN1, _IN2);                                       \
    _OUT1 = _INS(_IN3, _IN4, _IN5);                                       \
  } while (0)
54 
/*
 * Four-way form of DUP2_ARG1: _OUTn = _INS(_INn) for n = 0..3, expressed as
 * two DUP2_ARG1 invocations.
 * Wrapped in do { } while (0) so the invocation plus its trailing semicolon
 * is a single statement (safe inside an unbraced if/else).
 */
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
  do {                                                                      \
    DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1);                              \
    DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3);                              \
  } while (0)
60 
/*
 * Four-way form of DUP2_ARG2: applies the two-argument operation _INS to four
 * input pairs (_IN0,_IN1) ... (_IN6,_IN7) and stores the results in
 * _OUT0.._OUT3, expressed as two DUP2_ARG2 invocations.
 * Wrapped in do { } while (0) so the invocation plus its trailing semicolon
 * is a single statement (safe inside an unbraced if/else).
 */
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
                  _OUT1, _OUT2, _OUT3)                                         \
  do {                                                                         \
    DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1);                     \
    DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3);                     \
  } while (0)
67 
/*
 * Four-way form of DUP2_ARG3: applies the three-argument operation _INS to
 * four input triples (_IN0.._IN2) ... (_IN9.._IN11) and stores the results in
 * _OUT0.._OUT3, expressed as two DUP2_ARG3 invocations.
 * Wrapped in do { } while (0) so the invocation plus its trailing semicolon
 * is a single statement (safe inside an unbraced if/else).
 */
#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)             \
  do {                                                                        \
    DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1);        \
    DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3);      \
  } while (0)
74 
75 #ifdef __loongarch_sx
76 #include <lsxintrin.h>
77 /*
78  * =============================================================================
79  * Description : Dot product & addition of byte vector elements
80  * Arguments   : Inputs  - in_c, in_h, in_l
81  *               Outputs - out
82  *               Return Type - halfword
83  * Details     : Signed byte elements from in_h are multiplied by
84  *               signed byte elements from in_l, and then added adjacent to
85  *               each other to get results with the twice size of input.
86  *               Then the results plus to signed half-word elements from in_c.
87  * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
88  *        in_c : 1,2,3,4, 1,2,3,4
89  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
90  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
91  *         out : 23,40,41,26, 23,40,41,26
92  * =============================================================================
93  */
__lsx_vdp2add_h_b(__m128i in_c,__m128i in_h,__m128i in_l)94 static inline __m128i __lsx_vdp2add_h_b(__m128i in_c,
95                                         __m128i in_h,
96                                         __m128i in_l) {
97   __m128i out;
98 
99   out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
100   out = __lsx_vmaddwod_h_b(out, in_h, in_l);
101   return out;
102 }
103 
104 /*
105  * =============================================================================
106  * Description : Dot product & addition of byte vector elements
107  * Arguments   : Inputs  - in_c, in_h, in_l
108  *               Outputs - out
109  *               Return Type - halfword
110  * Details     : Unsigned byte elements from in_h are multiplied by
111  *               unsigned byte elements from in_l, and then added adjacent to
112  *               each other to get results with the twice size of input.
113  *               The results plus to signed half-word elements from in_c.
114  * Example     : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
115  *        in_c : 1,2,3,4, 1,2,3,4
116  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
117  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
118  *         out : 23,40,41,26, 23,40,41,26
119  * =============================================================================
120  */
__lsx_vdp2add_h_bu(__m128i in_c,__m128i in_h,__m128i in_l)121 static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c,
122                                          __m128i in_h,
123                                          __m128i in_l) {
124   __m128i out;
125 
126   out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
127   out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
128   return out;
129 }
130 
131 /*
132  * =============================================================================
133  * Description : Dot product & addition of byte vector elements
134  * Arguments   : Inputs  - in_c, in_h, in_l
135  *               Outputs - out
136  *               Return Type - halfword
137  * Details     : Unsigned byte elements from in_h are multiplied by
138  *               signed byte elements from in_l, and then added adjacent to
139  *               each other to get results with the twice size of input.
140  *               The results plus to signed half-word elements from in_c.
141  * Example     : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
142  *        in_c : 1,1,1,1, 1,1,1,1
143  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
144  *        in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
145  *         out : -4,-24,-60,-112, 6,26,62,114
146  * =============================================================================
147  */
__lsx_vdp2add_h_bu_b(__m128i in_c,__m128i in_h,__m128i in_l)148 static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c,
149                                            __m128i in_h,
150                                            __m128i in_l) {
151   __m128i out;
152 
153   out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
154   out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
155   return out;
156 }
157 
158 /*
159  * =============================================================================
160  * Description : Dot product & addition of half-word vector elements
161  * Arguments   : Inputs  - in_c, in_h, in_l
162  *               Outputs - out
163  *               Return Type - __m128i
164  * Details     : Signed half-word elements from in_h are multiplied by
165  *               signed half-word elements from in_l, and then added adjacent to
166  *               each other to get results with the twice size of input.
167  *               Then the results plus to signed word elements from in_c.
168  * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
169  *        in_c : 1,2,3,4
170  *        in_h : 1,2,3,4, 5,6,7,8
171  *        in_l : 8,7,6,5, 4,3,2,1
172  *         out : 23,40,41,26
173  * =============================================================================
174  */
__lsx_vdp2add_w_h(__m128i in_c,__m128i in_h,__m128i in_l)175 static inline __m128i __lsx_vdp2add_w_h(__m128i in_c,
176                                         __m128i in_h,
177                                         __m128i in_l) {
178   __m128i out;
179 
180   out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
181   out = __lsx_vmaddwod_w_h(out, in_h, in_l);
182   return out;
183 }
184 
185 /*
186  * =============================================================================
187  * Description : Dot product of byte vector elements
188  * Arguments   : Inputs  - in_h, in_l
189  *               Outputs - out
190  *               Return Type - halfword
191  * Details     : Signed byte elements from in_h are multiplied by
192  *               signed byte elements from in_l, and then added adjacent to
193  *               each other to get results with the twice size of input.
194  * Example     : out = __lsx_vdp2_h_b(in_h, in_l)
195  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
196  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
197  *         out : 22,38,38,22, 22,38,38,22
198  * =============================================================================
199  */
__lsx_vdp2_h_b(__m128i in_h,__m128i in_l)200 static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
201   __m128i out;
202 
203   out = __lsx_vmulwev_h_b(in_h, in_l);
204   out = __lsx_vmaddwod_h_b(out, in_h, in_l);
205   return out;
206 }
207 
208 /*
209  * =============================================================================
210  * Description : Dot product of byte vector elements
211  * Arguments   : Inputs  - in_h, in_l
212  *               Outputs - out
213  *               Return Type - halfword
214  * Details     : Unsigned byte elements from in_h are multiplied by
215  *               unsigned byte elements from in_l, and then added adjacent to
216  *               each other to get results with the twice size of input.
217  * Example     : out = __lsx_vdp2_h_bu(in_h, in_l)
218  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
219  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
220  *         out : 22,38,38,22, 22,38,38,22
221  * =============================================================================
222  */
__lsx_vdp2_h_bu(__m128i in_h,__m128i in_l)223 static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
224   __m128i out;
225 
226   out = __lsx_vmulwev_h_bu(in_h, in_l);
227   out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
228   return out;
229 }
230 
231 /*
232  * =============================================================================
233  * Description : Dot product of byte vector elements
234  * Arguments   : Inputs  - in_h, in_l
235  *               Outputs - out
236  *               Return Type - halfword
237  * Details     : Unsigned byte elements from in_h are multiplied by
238  *               signed byte elements from in_l, and then added adjacent to
239  *               each other to get results with the twice size of input.
240  * Example     : out = __lsx_vdp2_h_bu_b(in_h, in_l)
241  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
242  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
243  *         out : 22,38,38,22, 22,38,38,6
244  * =============================================================================
245  */
__lsx_vdp2_h_bu_b(__m128i in_h,__m128i in_l)246 static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
247   __m128i out;
248 
249   out = __lsx_vmulwev_h_bu_b(in_h, in_l);
250   out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
251   return out;
252 }
253 
254 /*
255  * =============================================================================
256  * Description : Dot product of byte vector elements
257  * Arguments   : Inputs  - in_h, in_l
258  *               Outputs - out
259  *               Return Type - halfword
260  * Details     : Signed byte elements from in_h are multiplied by
261  *               signed byte elements from in_l, and then added adjacent to
262  *               each other to get results with the twice size of input.
263  * Example     : out = __lsx_vdp2_w_h(in_h, in_l)
264  *        in_h : 1,2,3,4, 5,6,7,8
265  *        in_l : 8,7,6,5, 4,3,2,1
266  *         out : 22,38,38,22
267  * =============================================================================
268  */
__lsx_vdp2_w_h(__m128i in_h,__m128i in_l)269 static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
270   __m128i out;
271 
272   out = __lsx_vmulwev_w_h(in_h, in_l);
273   out = __lsx_vmaddwod_w_h(out, in_h, in_l);
274   return out;
275 }
276 
277 /*
278  * =============================================================================
279  * Description : Clip all halfword elements of input vector between min & max
280  *               out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
281  *               (_in))
282  * Arguments   : Inputs  - _in  (input vector)
283  *                       - min  (min threshold)
284  *                       - max  (max threshold)
285  *               Outputs - out  (output vector with clipped elements)
286  *               Return Type - signed halfword
287  * Example     : out = __lsx_vclip_h(_in)
288  *         _in : -8,2,280,249, -8,255,280,249
289  *         min : 1,1,1,1, 1,1,1,1
290  *         max : 9,9,9,9, 9,9,9,9
291  *         out : 1,2,9,9, 1,9,9,9
292  * =============================================================================
293  */
__lsx_vclip_h(__m128i _in,__m128i min,__m128i max)294 static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
295   __m128i out;
296 
297   out = __lsx_vmax_h(min, _in);
298   out = __lsx_vmin_h(max, out);
299   return out;
300 }
301 
302 /*
303  * =============================================================================
304  * Description : Set each element of vector between 0 and 255
305  * Arguments   : Inputs  - _in
306  *               Outputs - out
307  *               Return Type - halfword
308  * Details     : Signed byte elements from _in are clamped between 0 and 255.
309  * Example     : out = __lsx_vclip255_h(_in)
310  *         _in : -8,255,280,249, -8,255,280,249
311  *         out : 0,255,255,249, 0,255,255,249
312  * =============================================================================
313  */
__lsx_vclip255_h(__m128i _in)314 static inline __m128i __lsx_vclip255_h(__m128i _in) {
315   __m128i out;
316 
317   out = __lsx_vmaxi_h(_in, 0);
318   out = __lsx_vsat_hu(out, 7);
319   return out;
320 }
321 
322 /*
323  * =============================================================================
324  * Description : Set each element of vector between 0 and 255
325  * Arguments   : Inputs  - _in
326  *               Outputs - out
327  *               Return Type - word
328  * Details     : Signed byte elements from _in are clamped between 0 and 255.
329  * Example     : out = __lsx_vclip255_w(_in)
330  *         _in : -8,255,280,249
331  *         out : 0,255,255,249
332  * =============================================================================
333  */
__lsx_vclip255_w(__m128i _in)334 static inline __m128i __lsx_vclip255_w(__m128i _in) {
335   __m128i out;
336 
337   out = __lsx_vmaxi_w(_in, 0);
338   out = __lsx_vsat_wu(out, 7);
339   return out;
340 }
341 
/*
 * =============================================================================
 * Description : Swap two vector variables
 * Arguments   : Inputs  - _in0, _in1 (modifiable __m128i lvalues)
 *               Outputs - _in0, _in1 (in-place)
 * Details     : The values are exchanged through a temporary. Unlike a
 *               three-step XOR swap, this stays correct when both arguments
 *               name the same variable (an XOR swap would zero it).
 *               Wrapped in do { } while (0) so the invocation plus its
 *               trailing semicolon is a single statement.
 * Example     : LSX_SWAP(_in0, _in1)
 *        _in0 : 1,2,3,4
 *        _in1 : 5,6,7,8
 *   _in0(out) : 5,6,7,8
 *   _in1(out) : 1,2,3,4
 * =============================================================================
 */
#define LSX_SWAP(_in0, _in1)      \
  do {                            \
    __m128i _swap_tmp_m = (_in0); \
    (_in0) = (_in1);              \
    (_in1) = _swap_tmp_m;         \
  } while (0)
361 
/*
 * =============================================================================
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3     (rows of the block)
 *               Outputs - _out0, _out1, _out2, _out3 (rows of the transpose)
 * Details     : Rows are interleaved pairwise at word granularity into
 *               temporaries, which are then interleaved at double-word
 *               granularity. All inputs are read into temporaries before any
 *               output is written.
 *               Wrapped in do { } while (0) so the invocation plus its
 *               trailing semicolon is a single statement.
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 * =============================================================================
 */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  do {                                                                         \
    __m128i _t0, _t1, _t2, _t3;                                                \
                                                                               \
    _t0 = __lsx_vilvl_w(_in1, _in0);                                           \
    _t1 = __lsx_vilvh_w(_in1, _in0);                                           \
    _t2 = __lsx_vilvl_w(_in3, _in2);                                           \
    _t3 = __lsx_vilvh_w(_in3, _in2);                                           \
    _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
    _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
    _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
    _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
  } while (0)
388 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *               _out7
 * Details     : The rows of the matrix become columns, and the columns
 *               become rows. Only the low 8 bytes of each input are used.
 *               Even-numbered outputs come from word interleaves; each
 *               odd-numbered output is produced by a byte shuffle of the
 *               preceding even-numbered output with a zero vector.
 *               NOTE(review): the example shows 00 in the upper 8 bytes of
 *               every output, but the code does not explicitly clear the upper
 *               half of the even-numbered outputs - confirm before relying on
 *               those bytes.
 *               Internal temporaries carry a leading underscore / _m suffix so
 *               they cannot capture caller arguments named "zero" or "shuf8".
 *               Wrapped in do { } while (0) so the invocation plus its
 *               trailing semicolon is a single statement.
 * Example     : LSX_TRANSPOSE8x8_B
 *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 *       _out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
 *       _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
 *       _out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
 *       _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  do {                                                                      \
    __m128i _zero_m = {0};                                                  \
    __m128i _shuf8_m = {0x0F0E0D0C0B0A0908, 0x1716151413121110};            \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                         \
                                                                            \
    _t0 = __lsx_vilvl_b(_in2, _in0);                                        \
    _t1 = __lsx_vilvl_b(_in3, _in1);                                        \
    _t2 = __lsx_vilvl_b(_in6, _in4);                                        \
    _t3 = __lsx_vilvl_b(_in7, _in5);                                        \
    _t4 = __lsx_vilvl_b(_t1, _t0);                                          \
    _t5 = __lsx_vilvh_b(_t1, _t0);                                          \
    _t6 = __lsx_vilvl_b(_t3, _t2);                                          \
    _t7 = __lsx_vilvh_b(_t3, _t2);                                          \
    _out0 = __lsx_vilvl_w(_t6, _t4);                                        \
    _out2 = __lsx_vilvh_w(_t6, _t4);                                        \
    _out4 = __lsx_vilvl_w(_t7, _t5);                                        \
    _out6 = __lsx_vilvh_w(_t7, _t5);                                        \
    _out1 = __lsx_vshuf_b(_zero_m, _out0, _shuf8_m);                        \
    _out3 = __lsx_vshuf_b(_zero_m, _out2, _shuf8_m);                        \
    _out5 = __lsx_vshuf_b(_zero_m, _out4, _shuf8_m);                        \
    _out7 = __lsx_vshuf_b(_zero_m, _out6, _shuf8_m);                        \
  } while (0)
442 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *               _out7
 * Details     : Rows are interleaved at half-word granularity in two stages
 *               into eight temporaries; the outputs are then assembled by
 *               picking the even / odd double-words of temporary pairs.
 *               All inputs are read into temporaries before any output is
 *               written.
 *               Wrapped in do { } while (0) so the invocation plus its
 *               trailing semicolon is a single statement.
 * Example     :
 *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
 *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
 *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
 *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
 *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
 *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
 *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
 *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  do {                                                                      \
    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;               \
                                                                            \
    _s0 = __lsx_vilvl_h(_in6, _in4);                                        \
    _s1 = __lsx_vilvl_h(_in7, _in5);                                        \
    _t0 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t1 = __lsx_vilvh_h(_s1, _s0);                                          \
    _s0 = __lsx_vilvh_h(_in6, _in4);                                        \
    _s1 = __lsx_vilvh_h(_in7, _in5);                                        \
    _t2 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t3 = __lsx_vilvh_h(_s1, _s0);                                          \
    _s0 = __lsx_vilvl_h(_in2, _in0);                                        \
    _s1 = __lsx_vilvl_h(_in3, _in1);                                        \
    _t4 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t5 = __lsx_vilvh_h(_s1, _s0);                                          \
    _s0 = __lsx_vilvh_h(_in2, _in0);                                        \
    _s1 = __lsx_vilvh_h(_in3, _in1);                                        \
    _t6 = __lsx_vilvl_h(_s1, _s0);                                          \
    _t7 = __lsx_vilvh_h(_s1, _s0);                                          \
                                                                            \
    _out0 = __lsx_vpickev_d(_t0, _t4);                                      \
    _out2 = __lsx_vpickev_d(_t1, _t5);                                      \
    _out4 = __lsx_vpickev_d(_t2, _t6);                                      \
    _out6 = __lsx_vpickev_d(_t3, _t7);                                      \
    _out1 = __lsx_vpickod_d(_t0, _t4);                                      \
    _out3 = __lsx_vpickod_d(_t1, _t5);                                      \
    _out5 = __lsx_vpickod_d(_t2, _t6);                                      \
    _out7 = __lsx_vpickod_d(_t3, _t7);                                      \
  } while (0)
492 
/*
 * =============================================================================
 * Description : Transpose input 8x4 byte block into 4x8
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (eight rows, 4 bytes used from each)
 *               Outputs - _out0, _out1, _out2, _out3  (four rows of 8 bytes)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. _out1 and _out3 are derived from the high double-words
 *               of _out0 and _out2 after those have been written.
 *               NOTE(review): the example shows 00 in the upper 8 bytes of
 *               every output, but the code does not explicitly clear them -
 *               confirm before relying on those bytes.
 *               Wrapped in do { } while (0) so the invocation plus its
 *               trailing semicolon is a single statement.
 * Example     : LSX_TRANSPOSE8x4_B
 *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3)                     \
  do {                                                                     \
    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                            \
                                                                           \
    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                 \
    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                 \
    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                 \
    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                 \
                                                                           \
    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                             \
    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                             \
                                                                           \
    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                               \
    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                               \
    _out1 = __lsx_vilvh_d(_out2, _out0);                                   \
    _out3 = __lsx_vilvh_d(_out0, _out2);                                   \
  } while (0)
537 
/*
 * =============================================================================
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : Sixteen rows (low 8 bytes used from each) are interleaved at
 *               byte, word and double-word granularity into eight output
 *               rows of 16 bytes. Built on the DUP2_ARG2 / DUP4_ARG2 helper
 *               macros from this header. All inputs are read into temporaries
 *               before any output is written.
 *               Wrapped in do { } while (0) so the invocation plus its
 *               trailing semicolon is a single statement.
 * Example     :
 *              000,001,002,003,004,005,006,007
 *              008,009,010,011,012,013,014,015
 *              016,017,018,019,020,021,022,023
 *              024,025,026,027,028,029,030,031
 *              032,033,034,035,036,037,038,039
 *              040,041,042,043,044,045,046,047        000,008,...,112,120
 *              048,049,050,051,052,053,054,055        001,009,...,113,121
 *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
 *              064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
 *              072,073,074,075,076,077,078,079        004,012,...,116,124
 *              080,081,082,083,084,085,086,087        005,013,...,117,125
 *              088,089,090,091,092,093,094,095        006,014,...,118,126
 *              096,097,098,099,100,101,102,103        007,015,...,119,127
 *              104,105,106,107,108,109,110,111
 *              112,113,114,115,116,117,118,119
 *              120,121,122,123,124,125,126,127
 * =============================================================================
 */
#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                            _out6, _out7)                                    \
  do {                                                                       \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
              _tmp0, _tmp1, _tmp2, _tmp3);                                   \
    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,  \
              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                            \
    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);          \
    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);          \
    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);          \
    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);          \
    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);              \
    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);              \
    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);              \
    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);              \
    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);      \
    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);      \
    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);      \
    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);      \
  } while (0)
588 
589 /*
590  * =============================================================================
591  * Description : Butterfly of 4 input vectors
592  * Arguments   : Inputs  - in0, in1, in2, in3
593  *               Outputs - out0, out1, out2, out3
594  * Details     : Butterfly operation
595  * Example     :
596  *               out0 = in0 + in3;
597  *               out1 = in1 + in2;
598  *               out2 = in1 - in2;
599  *               out3 = in0 - in3;
600  * =============================================================================
601  */
/* Byte-element variant. Outputs are written in the order _out0.._out3 (both
 * sums first, then both differences), each one read from the input operands
 * at that point. */
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    DUP2_ARG2(__lsx_vadd_b, _in0, _in3, _in1, _in2, _out0, _out1);            \
    DUP2_ARG2(__lsx_vsub_b, _in1, _in2, _in0, _in3, _out2, _out3);            \
  }
/* Halfword-element variant. Outputs are written in the order _out0.._out3
 * (both sums first, then both differences), each one read from the input
 * operands at that point. */
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    DUP2_ARG2(__lsx_vadd_h, _in0, _in3, _in1, _in2, _out0, _out1);            \
    DUP2_ARG2(__lsx_vsub_h, _in1, _in2, _in0, _in3, _out2, _out3);            \
  }
/* Word-element variant. Outputs are written in the order _out0.._out3 (both
 * sums first, then both differences), each one read from the input operands
 * at that point. */
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    DUP2_ARG2(__lsx_vadd_w, _in0, _in3, _in1, _in2, _out0, _out1);            \
    DUP2_ARG2(__lsx_vsub_w, _in1, _in2, _in0, _in3, _out2, _out3);            \
  }
/* Doubleword-element variant. Outputs are written in the order _out0.._out3
 * (both sums first, then both differences), each one read from the input
 * operands at that point. */
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  {                                                                           \
    DUP2_ARG2(__lsx_vadd_d, _in0, _in3, _in1, _in2, _out0, _out1);            \
    DUP2_ARG2(__lsx_vsub_d, _in1, _in2, _in0, _in3, _out2, _out3);            \
  }
630 
631 /*
632  * =============================================================================
633  * Description : Butterfly of 8 input vectors
634  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
635  *               Outputs - _out0, _out1, _out2, _out3, ~
636  * Details     : Butterfly operation
637  * Example     :
638  *              _out0 = _in0 + _in7;
639  *              _out1 = _in1 + _in6;
640  *              _out2 = _in2 + _in5;
641  *              _out3 = _in3 + _in4;
642  *              _out4 = _in3 - _in4;
643  *              _out5 = _in2 - _in5;
644  *              _out6 = _in1 - _in6;
645  *              _out7 = _in0 - _in7;
646  * =============================================================================
647  */
/* Byte-element variant. Outputs are written in the order _out0.._out7 (the
 * four sums first, then the four differences), each one read from the input
 * operands at that point. */
#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    DUP2_ARG2(__lsx_vadd_b, _in0, _in7, _in1, _in6, _out0, _out1);         \
    DUP2_ARG2(__lsx_vadd_b, _in2, _in5, _in3, _in4, _out2, _out3);         \
    DUP2_ARG2(__lsx_vsub_b, _in3, _in4, _in2, _in5, _out4, _out5);         \
    DUP2_ARG2(__lsx_vsub_b, _in1, _in6, _in0, _in7, _out6, _out7);         \
  }
661 
/* Halfword-element variant. Outputs are written in the order _out0.._out7
 * (the four sums first, then the four differences), each one read from the
 * input operands at that point. */
#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    DUP2_ARG2(__lsx_vadd_h, _in0, _in7, _in1, _in6, _out0, _out1);         \
    DUP2_ARG2(__lsx_vadd_h, _in2, _in5, _in3, _in4, _out2, _out3);         \
    DUP2_ARG2(__lsx_vsub_h, _in3, _in4, _in2, _in5, _out4, _out5);         \
    DUP2_ARG2(__lsx_vsub_h, _in1, _in6, _in0, _in7, _out6, _out7);         \
  }
675 
/* Word-element variant. Outputs are written in the order _out0.._out7 (the
 * four sums first, then the four differences), each one read from the input
 * operands at that point. */
#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    DUP2_ARG2(__lsx_vadd_w, _in0, _in7, _in1, _in6, _out0, _out1);         \
    DUP2_ARG2(__lsx_vadd_w, _in2, _in5, _in3, _in4, _out2, _out3);         \
    DUP2_ARG2(__lsx_vsub_w, _in3, _in4, _in2, _in5, _out4, _out5);         \
    DUP2_ARG2(__lsx_vsub_w, _in1, _in6, _in0, _in7, _out6, _out7);         \
  }
689 
/* Doubleword-element variant. Outputs are written in the order _out0.._out7
 * (the four sums first, then the four differences), each one read from the
 * input operands at that point. */
#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                          _out7)                                           \
  {                                                                        \
    DUP2_ARG2(__lsx_vadd_d, _in0, _in7, _in1, _in6, _out0, _out1);         \
    DUP2_ARG2(__lsx_vadd_d, _in2, _in5, _in3, _in4, _out2, _out3);         \
    DUP2_ARG2(__lsx_vsub_d, _in3, _in4, _in2, _in5, _out4, _out5);         \
    DUP2_ARG2(__lsx_vsub_d, _in1, _in6, _in0, _in7, _out6, _out7);         \
  }
703 
704 #endif  // LSX
705 
706 #ifdef __loongarch_asx
707 #include <lasxintrin.h>
708 /*
709  * =============================================================================
710  * Description : Dot product of byte vector elements
711  * Arguments   : Inputs - in_h, in_l
712  *               Output - out
713  *               Return Type - signed halfword
714  * Details     : Unsigned byte elements from in_h are multiplied with
715  *               unsigned byte elements from in_l producing a result
716  *               twice the size of input i.e. signed halfword.
717  *               Then this multiplied results of adjacent odd-even elements
718  *               are added to the out vector
719  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
720  * =============================================================================
721  */
static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
  // Widened products of the even-indexed unsigned byte pairs.
  __m256i even_prod = __lasx_xvmulwev_h_bu(in_h, in_l);

  // Add the widened products of the odd-indexed pairs on top.
  return __lasx_xvmaddwod_h_bu(even_prod, in_h, in_l);
}
729 
730 /*
731  * =============================================================================
732  * Description : Dot product of byte vector elements
733  * Arguments   : Inputs - in_h, in_l
734  *               Output - out
735  *               Return Type - signed halfword
736  * Details     : Signed byte elements from in_h are multiplied with
737  *               signed byte elements from in_l producing a result
738  *               twice the size of input i.e. signed halfword.
739  *               Then this multiplication results of adjacent odd-even elements
740  *               are added to the out vector
741  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
742  * =============================================================================
743  */
static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
  // Widened products of the even-indexed signed byte pairs.
  __m256i even_prod = __lasx_xvmulwev_h_b(in_h, in_l);

  // Add the widened products of the odd-indexed pairs on top.
  return __lasx_xvmaddwod_h_b(even_prod, in_h, in_l);
}
751 
752 /*
753  * =============================================================================
754  * Description : Dot product of halfword vector elements
755  * Arguments   : Inputs - in_h, in_l
756  *               Output - out
757  *               Return Type - signed word
758  * Details     : Signed halfword elements from in_h are multiplied with
759  *               signed halfword elements from in_l producing a result
760  *               twice the size of input i.e. signed word.
761  *               Then this multiplied results of adjacent odd-even elements
762  *               are added to the out vector.
763  * Example     : out = __lasx_xvdp2_w_h(in_h, in_l)
764  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
765  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
766  *         out : 22,38,38,22, 22,38,38,22
767  * =============================================================================
768  */
static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
  // Widened products of the even-indexed signed halfword pairs.
  __m256i even_prod = __lasx_xvmulwev_w_h(in_h, in_l);

  // Add the widened products of the odd-indexed pairs on top.
  return __lasx_xvmaddwod_w_h(even_prod, in_h, in_l);
}
776 
777 /*
778  * =============================================================================
779  * Description : Dot product of word vector elements
780  * Arguments   : Inputs - in_h, in_l
781  *               Output - out
782  *               Return Type - signed double
783  * Details     : Signed word elements from in_h are multiplied with
784  *               signed word elements from in_l producing a result
785  *               twice the size of input i.e. signed double-word.
786  *               Then this multiplied results of adjacent odd-even elements
787  *               are added to the out vector.
788  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
789  * =============================================================================
790  */
static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
  // Widened products of the even-indexed signed word pairs.
  __m256i even_prod = __lasx_xvmulwev_d_w(in_h, in_l);

  // Add the widened products of the odd-indexed pairs on top.
  return __lasx_xvmaddwod_d_w(even_prod, in_h, in_l);
}
798 
799 /*
800  * =============================================================================
801  * Description : Dot product of halfword vector elements
802  * Arguments   : Inputs - in_h, in_l
803  *               Output - out
804  *               Return Type - signed word
805  * Details     : Unsigned halfword elements from in_h are multiplied with
806  *               signed halfword elements from in_l producing a result
807  *               twice the size of input i.e. signed word.
808  *               Multiplication result of adjacent odd-even elements
809  *               are added to the out vector
810  * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
811  * =============================================================================
812  */
static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
  // Widened products of the even-indexed pairs (in_h unsigned, in_l signed).
  __m256i even_prod = __lasx_xvmulwev_w_hu_h(in_h, in_l);

  // Add the widened products of the odd-indexed pairs on top.
  return __lasx_xvmaddwod_w_hu_h(even_prod, in_h, in_l);
}
820 
821 /*
822  * =============================================================================
823  * Description : Dot product & addition of byte vector elements
824  * Arguments   : Inputs - in_c, in_h, in_l
825  *               Output - out
826  *               Return Type - halfword
827  * Details     : Signed byte elements from in_h are multiplied with
828  *               signed byte elements from in_l producing a result
829  *               twice the size of input i.e. signed halfword.
830  *               Then this multiplied results of adjacent odd-even elements
831  *               are added to the in_c vector.
832  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
833  * =============================================================================
834  */
static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  // in_c plus the widened products of the even-indexed signed byte pairs.
  __m256i acc = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);

  // Fold in the widened products of the odd-indexed pairs.
  return __lasx_xvmaddwod_h_b(acc, in_h, in_l);
}
844 
845 /*
846  * =============================================================================
847  * Description : Dot product & addition of byte vector elements
848  * Arguments   : Inputs - in_c, in_h, in_l
849  *               Output - out
850  *               Return Type - halfword
851  * Details     : Unsigned byte elements from in_h are multiplied with
852  *               unsigned byte elements from in_l producing a result
853  *               twice the size of input i.e. signed halfword.
854  *               Then this multiplied results of adjacent odd-even elements
855  *               are added to the in_c vector.
856  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
857  * =============================================================================
858  */
static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
                                           __m256i in_l) {
  // in_c plus the widened products of the even-indexed unsigned byte pairs.
  __m256i acc = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);

  // Fold in the widened products of the odd-indexed pairs.
  return __lasx_xvmaddwod_h_bu(acc, in_h, in_l);
}
868 
869 /*
870  * =============================================================================
871  * Description : Dot product & addition of byte vector elements
872  * Arguments   : Inputs - in_c, in_h, in_l
873  *               Output - out
874  *               Return Type - halfword
875  * Details     : Unsigned byte elements from in_h are multiplied with
876  *               signed byte elements from in_l producing a result
877  *               twice the size of input i.e. signed halfword.
878  *               Then this multiplied results of adjacent odd-even elements
879  *               are added to the in_c vector.
880  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
881  * =============================================================================
882  */
static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
                                             __m256i in_l) {
  // in_c plus the widened products of the even-indexed byte pairs
  // (in_h unsigned, in_l signed).
  __m256i acc = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);

  // Fold in the widened products of the odd-indexed pairs.
  return __lasx_xvmaddwod_h_bu_b(acc, in_h, in_l);
}
892 
893 /*
894  * =============================================================================
895  * Description : Dot product of halfword vector elements
896  * Arguments   : Inputs - in_c, in_h, in_l
897  *               Output - out
898  *               Return Type - signed word
899  * Details     : Signed halfword elements from in_h are multiplied with
900  *               signed halfword elements from in_l producing a result
901  *               twice the size of input i.e. signed word.
902  *               Multiplication result of adjacent odd-even elements
903  *               are added to the in_c vector.
904  * Example     : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
905  *        in_c : 1,2,3,4, 1,2,3,4
906  *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
907  *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
908  *         out : 23,40,41,26, 23,40,41,26
909  * =============================================================================
910  */
static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  // in_c plus the widened products of the even-indexed signed halfword pairs.
  __m256i acc = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);

  // Fold in the widened products of the odd-indexed pairs.
  return __lasx_xvmaddwod_w_h(acc, in_h, in_l);
}
920 
921 /*
922  * =============================================================================
923  * Description : Dot product of halfword vector elements
924  * Arguments   : Inputs - in_c, in_h, in_l
925  *               Output - out
926  *               Return Type - signed word
927  * Details     : Unsigned halfword elements from in_h are multiplied with
928  *               unsigned halfword elements from in_l producing a result
929  *               twice the size of input i.e. signed word.
930  *               Multiplication result of adjacent odd-even elements
931  *               are added to the in_c vector.
932  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
933  * =============================================================================
934  */
static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
                                           __m256i in_l) {
  // in_c plus the widened products of the even-indexed unsigned halfword
  // pairs.
  __m256i acc = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);

  // Fold in the widened products of the odd-indexed pairs.
  return __lasx_xvmaddwod_w_hu(acc, in_h, in_l);
}
944 
945 /*
946  * =============================================================================
947  * Description : Dot product of halfword vector elements
948  * Arguments   : Inputs - in_c, in_h, in_l
949  *               Output - out
950  *               Return Type - signed word
951  * Details     : Unsigned halfword elements from in_h are multiplied with
952  *               signed halfword elements from in_l producing a result
953  *               twice the size of input i.e. signed word.
954  *               Multiplication result of adjacent odd-even elements
955  *               are added to the in_c vector
956  * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
957  * =============================================================================
958  */
static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
                                             __m256i in_l) {
  // in_c plus the widened products of the even-indexed halfword pairs
  // (in_h unsigned, in_l signed).
  __m256i acc = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);

  // Fold in the widened products of the odd-indexed pairs.
  return __lasx_xvmaddwod_w_hu_h(acc, in_h, in_l);
}
968 
969 /*
970  * =============================================================================
971  * Description : Vector Unsigned Dot Product and Subtract
972  * Arguments   : Inputs - in_c, in_h, in_l
973  *               Output - out
974  *               Return Type - signed halfword
975  * Details     : Unsigned byte elements from in_h are multiplied with
976  *               unsigned byte elements from in_l producing a result
977  *               twice the size of input i.e. signed halfword.
978  *               Multiplication result of adjacent odd-even elements
979  *               are added together and subtracted from double width elements
980  *               in_c vector.
981  * Example     : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
982  * =============================================================================
983  */
static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
                                           __m256i in_l) {
  // Pairwise dot product: even-indexed byte products, then odd-indexed
  // products accumulated on top.
  __m256i dot = __lasx_xvmulwev_h_bu(in_h, in_l);
  dot = __lasx_xvmaddwod_h_bu(dot, in_h, in_l);

  // Subtract the dot product from the halfword accumulator.
  return __lasx_xvsub_h(in_c, dot);
}
994 
995 /*
996  * =============================================================================
997  * Description : Vector Signed Dot Product and Subtract
998  * Arguments   : Inputs - in_c, in_h, in_l
999  *               Output - out
1000  *               Return Type - signed word
1001  * Details     : Signed halfword elements from in_h are multiplied with
1002  *               Signed halfword elements from in_l producing a result
1003  *               twice the size of input i.e. signed word.
1004  *               Multiplication result of adjacent odd-even elements
1005  *               are added together and subtracted from double width elements
1006  *               in_c vector.
1007  * Example     : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
1008  *        in_c : 0,0,0,0, 0,0,0,0
1009  *        in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
1010  *        in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
1011  *         out : -7,-3,0,0, 0,-1,0,-1
1012  * =============================================================================
1013  */
static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  // Pairwise dot product: even-indexed halfword products, then odd-indexed
  // products accumulated on top.
  __m256i dot = __lasx_xvmulwev_w_h(in_h, in_l);
  dot = __lasx_xvmaddwod_w_h(dot, in_h, in_l);

  // Subtract the dot product from the word accumulator.
  return __lasx_xvsub_w(in_c, dot);
}
1024 
1025 /*
1026  * =============================================================================
1027  * Description : Dot product of halfword vector elements
1028  * Arguments   : Inputs - in_h, in_l
1029  *               Output - out
1030  *               Return Type - signed doubleword
1031  * Details     : Signed halfword elements from in_h are multiplied with
1032  *               signed halfword elements from in_l producing a result
1033  *               four times the size of input i.e. signed doubleword.
1034  *               Then this multiplication results of four adjacent elements
1035  *               are added together and stored to the out vector.
1036  * Example     : out = __lasx_xvdp4_d_h(in_h, in_l)
1037  *        in_h :  3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
1038  *        in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
1039  *         out : -2,0,1,1
1040  * =============================================================================
1041  */
static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
  // Word-sized dot product of each adjacent halfword pair.
  __m256i pair_dot = __lasx_xvmulwev_w_h(in_h, in_l);
  pair_dot = __lasx_xvmaddwod_w_h(pair_dot, in_h, in_l);

  // The same vector is passed as both operands so that neighbouring word
  // results are added together into one doubleword.
  return __lasx_xvhaddw_d_w(pair_dot, pair_dot);
}
1050 
1051 /*
1052  * =============================================================================
1053  * Description : The high half of the vector elements are expanded and
1054  *               added after being doubled.
1055  * Arguments   : Inputs - in_h, in_l
1056  *               Output - out
1057  * Details     : The in_h vector and the in_l vector are added after the
1058  *               higher half of the two-fold sign extension (signed byte
1059  *               to signed halfword) and stored to the out vector.
1060  * Example     : See out = __lasx_xvaddwh_w_h(in_h, in_l)
1061  * =============================================================================
1062  */
static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
  // Interleave the higher-half bytes of the two inputs so that matching
  // elements sit next to each other.
  __m256i paired = __lasx_xvilvh_b(in_h, in_l);

  // The same vector is passed as both operands so that each neighbouring
  // byte pair is summed into one signed halfword.
  return __lasx_xvhaddw_h_b(paired, paired);
}
1070 
1071 /*
1072  * =============================================================================
1073  * Description : The high half of the vector elements are expanded and
1074  *               added after being doubled.
1075  * Arguments   : Inputs - in_h, in_l
1076  *               Output - out
1077  * Details     : The in_h vector and the in_l vector are added after the
1078  *               higher half of the two-fold sign extension (signed halfword
1079  *               to signed word) and stored to the out vector.
1080  * Example     : out = __lasx_xvaddwh_w_h(in_h, in_l)
1081  *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1082  *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1083  *         out : 1,0,0,-1, 1,0,0, 2
1084  * =============================================================================
1085  */
static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
  // Interleave the higher-half halfwords of the two inputs so that matching
  // elements sit next to each other.
  __m256i paired = __lasx_xvilvh_h(in_h, in_l);

  // The same vector is passed as both operands so that each neighbouring
  // halfword pair is summed into one signed word.
  return __lasx_xvhaddw_w_h(paired, paired);
}
1093 
1094 /*
1095  * =============================================================================
1096  * Description : The low half of the vector elements are expanded and
1097  *               added after being doubled.
1098  * Arguments   : Inputs - in_h, in_l
1099  *               Output - out
1100  * Details     : The in_h vector and the in_l vector are added after the
1101  *               lower half of the two-fold sign extension (signed byte
1102  *               to signed halfword) and stored to the out vector.
1103  * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1104  * =============================================================================
1105  */
static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
  // Interleave the lower-half bytes of the two inputs so that matching
  // elements sit next to each other.
  __m256i paired = __lasx_xvilvl_b(in_h, in_l);

  // The same vector is passed as both operands so that each neighbouring
  // byte pair is summed into one signed halfword.
  return __lasx_xvhaddw_h_b(paired, paired);
}
1113 
1114 /*
1115  * =============================================================================
1116  * Description : The low half of the vector elements are expanded and
1117  *               added after being doubled.
1118  * Arguments   : Inputs - in_h, in_l
1119  *               Output - out
1120  * Details     : The in_h vector and the in_l vector are added after the
1121  *               lower half of the two-fold sign extension (signed halfword
1122  *               to signed word) and stored to the out vector.
1123  * Example     : out = __lasx_xvaddwl_w_h(in_h, in_l)
1124  *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1125  *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1126  *         out : 5,-1,4,2, 1,0,2,-1
1127  * =============================================================================
1128  */
static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
  // Interleave the lower-half halfwords of the two inputs so that matching
  // elements sit next to each other.
  __m256i paired = __lasx_xvilvl_h(in_h, in_l);

  // The same vector is passed as both operands so that each neighbouring
  // halfword pair is summed into one signed word.
  return __lasx_xvhaddw_w_h(paired, paired);
}
1136 
1137 /*
1138  * =============================================================================
1139  * Description : The low half of the vector elements are expanded and
1140  *               added after being doubled.
1141  * Arguments   : Inputs - in_h, in_l
1142  *               Output - out
1143  * Details     : The in_h vector and the in_l vector are added after the
1144  *               lower half of the two-fold zero extension (unsigned byte
1145  *               to unsigned halfword) and stored to the out vector.
1146  * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1147  * =============================================================================
1148  */
static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
  // Interleave the lower-half bytes of the two inputs so that matching
  // elements sit next to each other.
  __m256i paired = __lasx_xvilvl_b(in_h, in_l);

  // The same vector is passed as both operands so that each neighbouring
  // byte pair is summed, as unsigned values, into one halfword.
  return __lasx_xvhaddw_hu_bu(paired, paired);
}
1156 
1157 /*
1158  * =============================================================================
1159  * Description : The low half of the vector elements are expanded and
1160  *               added after being doubled.
1161  * Arguments   : Inputs - in_h, in_l
1162  *               Output - out
1163  * Details     : The in_l vector after double zero extension (unsigned byte to
1164  *               signed halfword), added to the in_h vector.
1165  * Example     : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
1166  * =============================================================================
1167  */
static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
  // A widening shift by 0 is used purely to zero-extend the unsigned bytes of
  // in_l to halfwords.
  __m256i widened = __lasx_xvsllwil_hu_bu(in_l, 0);

  return __lasx_xvadd_h(in_h, widened);
}
1175 
1176 /*
1177  * =============================================================================
1178  * Description : The low half of the vector elements are expanded and
1179  *               added after being doubled.
1180  * Arguments   : Inputs - in_h, in_l
1181  *               Output - out
1182  * Details     : The in_l vector after double sign extension (signed halfword to
1183  *               signed word), added to the in_h vector.
1184  * Example     : out = __lasx_xvaddw_w_w_h(in_h, in_l)
1185  *        in_h : 0, 1,0,0, -1,0,0,1,
1186  *        in_l : 2,-1,1,2,  1,0,0,0, 0,0,1,0, 1,0,0,1,
1187  *         out : 2, 0,1,2, -1,0,1,1,
1188  * =============================================================================
1189  */
static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
  // A widening shift by 0 is used purely to sign-extend the halfwords of in_l
  // to words.
  __m256i widened = __lasx_xvsllwil_w_h(in_l, 0);

  return __lasx_xvadd_w(in_h, widened);
}
1197 
1198 /*
1199  * =============================================================================
1200  * Description : Multiplication and addition calculation after expansion
1201  *               of the lower half of the vector.
1202  * Arguments   : Inputs - in_c, in_h, in_l
1203  *               Output - out
1204  * Details     : The in_h vector and the in_l vector are multiplied after
1205  *               the lower half of the two-fold sign extension (signed halfword
1206  *               to signed word), and the result is added to the vector in_c,
1207  *               then stored to the out vector.
1208  * Example     : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1209  *        in_c : 1,2,3,4, 5,6,7,8
1210  *        in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
1211  *        in_l : 200, 300, 400, 500,  2000, 3000, 4000, 5000,
1212  *              -200,-300,-400,-500, -2000,-3000,-4000,-5000
1213  *         out : 201, 602,1203,2004, -995, -1794,-2793,-3992
1214  * =============================================================================
1215  */
static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
                                          __m256i in_l) {
  // Widening shifts by 0 sign-extend the lower-half halfwords of each operand
  // to words.
  __m256i wide_h = __lasx_xvsllwil_w_h(in_h, 0);
  __m256i wide_l = __lasx_xvsllwil_w_h(in_l, 0);

  // Word-wise product of the extended operands, then add the accumulator.
  __m256i prod = __lasx_xvmul_w(wide_h, wide_l);
  return __lasx_xvadd_w(prod, in_c);
}
1227 
1228 /*
1229  * =============================================================================
1230  * Description : Multiplication and addition calculation after expansion
1231  *               of the higher half of the vector.
1232  * Arguments   : Inputs - in_c, in_h, in_l
1233  *               Output - out
1234  * Details     : The in_h vector and the in_l vector are multiplied after
1235  *               the higher half of the two-fold sign extension (signed
1236  *               halfword to signed word), and the result is added to
1237  *               the vector in_c, then stored to the out vector.
1238  * Example     : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1239  * =============================================================================
1240  */
static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c,
                                          __m256i in_h,
                                          __m256i in_l) {
  /* Interleave each operand with itself via xvilvh_h, then let
   * xvmulwev_w_h form the widened (halfword to word) products. */
  __m256i dup_h = __lasx_xvilvh_h(in_h, in_h);
  __m256i dup_l = __lasx_xvilvh_h(in_l, in_l);
  __m256i product = __lasx_xvmulwev_w_h(dup_h, dup_l);

  /* Accumulate the word products onto in_c. */
  return __lasx_xvadd_w(product, in_c);
}
1252 
1253 /*
1254  * =============================================================================
1255  * Description : Multiplication calculation after expansion of the lower
1256  *               half of the vector.
1257  * Arguments   : Inputs - in_h, in_l
1258  *               Output - out
1259  * Details     : The in_h vector and the in_l vector are multiplied after
1260  *               the lower half of the two-fold sign extension (signed
1261  *               halfword to signed word), then stored to the out vector.
1262  * Example     : out = __lasx_xvmulwl_w_h(in_h, in_l)
1263  *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1264  *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1265  *         out : 6,1,3,0, 0,0,1,0
1266  * =============================================================================
1267  */
static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
  /* Widen both factors from signed halfwords to signed words. */
  __m256i wide_h = __lasx_xvsllwil_w_h(in_h, 0);
  __m256i wide_l = __lasx_xvsllwil_w_h(in_l, 0);

  /* Word-wise product of the widened operands. */
  return __lasx_xvmul_w(wide_h, wide_l);
}
1276 
1277 /*
1278  * =============================================================================
1279  * Description : Multiplication calculation after expansion of the higher
1280  *               half of the vector.
1281  * Arguments   : Inputs - in_h, in_l
1282  *               Output - out
1283  * Details     : The in_h vector and the in_l vector are multiplied after
1284  *               the higher half of the two-fold sign extension (signed
1285  *               halfword to signed word), then stored to the out vector.
1286  * Example     : out = __lasx_xvmulwh_w_h(in_h, in_l)
1287  *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1288  *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1289  *         out : 0,0,0,0, 0,0,0,1
1290  * =============================================================================
1291  */
static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
  /* Interleave each operand with itself via xvilvh_h, then let
   * xvmulwev_w_h form the widened (halfword to word) products. */
  __m256i dup_h = __lasx_xvilvh_h(in_h, in_h);
  __m256i dup_l = __lasx_xvilvh_h(in_l, in_l);

  return __lasx_xvmulwev_w_h(dup_h, dup_l);
}
1300 
1301 /*
1302  * =============================================================================
1303  * Description : The low half of the vector elements are added to the high half
1304  *               after being doubled, then saturated.
1305  * Arguments   : Inputs - in_h, in_l
1306  *               Output - out
1307  * Details     : The in_h vector adds the in_l vector after the lower half of
1308  *               the two-fold zero extension (unsigned byte to unsigned
1309  *               halfword) and then saturated. The results are stored to the out
1310  *               vector.
1311  * Example     : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
1312  *        in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
1313  *        in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
1314  *               0,0,0,1
1315  *        out  : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
1316  * =============================================================================
1317  */
static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
  __m256i zero = {0};

  /* Interleaving in_l's bytes with zero bytes widens them to unsigned
   * halfwords. */
  __m256i wide_l = __lasx_xvilvl_b(zero, in_l);

  /* Saturating unsigned halfword addition. */
  return __lasx_xvsadd_hu(in_h, wide_l);
}
1326 
1327 /*
1328  * =============================================================================
1329  * Description : Clip all halfword elements of input vector between min & max
1330  *               out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
1331  * Arguments   : Inputs  - in    (input vector)
1332  *                       - min   (min threshold)
1333  *                       - max   (max threshold)
1334  *               Outputs - out   (output vector with clipped elements)
1335  *               Return Type - signed halfword
1336  * Example     : out = __lasx_xvclip_h(in, min, max)
1337  *          in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
1338  *         min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
1339  *         max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
1340  *         out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
1341  * =============================================================================
1342  */
static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
  /* Raise every halfword to at least min, then cap the result at max. */
  __m256i floored = __lasx_xvmax_h(min, in);

  return __lasx_xvmin_h(max, floored);
}
1350 
1351 /*
1352  * =============================================================================
1353  * Description : Clip all signed halfword elements of input vector
1354  *               between 0 & 255
1355  * Arguments   : Inputs  - in   (input vector)
1356  *               Outputs - out  (output vector with clipped elements)
1357  *               Return Type - signed halfword
1358  * Example     : See out = __lasx_xvclip255_w(in)
1359  * =============================================================================
1360  */
static inline __m256i __lasx_xvclip255_h(__m256i in) {
  /* Lower bound: replace negative halfwords with 0. */
  __m256i non_negative = __lasx_xvmaxi_h(in, 0);

  /* Upper bound: unsigned saturation with immediate 7 (the 0..255 clip). */
  return __lasx_xvsat_hu(non_negative, 7);
}
1368 
1369 /*
1370  * =============================================================================
1371  * Description : Clip all signed word elements of input vector
1372  *               between 0 & 255
1373  * Arguments   : Inputs - in   (input vector)
1374  *               Output - out  (output vector with clipped elements)
1375  *               Return Type - signed word
1376  * Example     : out = __lasx_xvclip255_w(in)
1377  *          in : -8,255,280,249, -8,255,280,249
1378  *         out :  0,255,255,249,  0,255,255,249
1379  * =============================================================================
1380  */
static inline __m256i __lasx_xvclip255_w(__m256i in) {
  /* Lower bound: replace negative words with 0. */
  __m256i non_negative = __lasx_xvmaxi_w(in, 0);

  /* Upper bound: unsigned saturation with immediate 7 (the 0..255 clip). */
  return __lasx_xvsat_wu(non_negative, 7);
}
1388 
1389 /*
1390  * =============================================================================
1391  * Description : Indexed halfword element values are replicated to all
1392  *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
1393  *               if 'idx >= 8' use xvsplati_h_*.
1394  * Arguments   : Inputs - in, idx
1395  *               Output - out
1396  * Details     : Idx element value from in vector is replicated to all
1397  *               elements in out vector.
1398  *               Valid index range for halfword operation is 0-7
1399  * Example     : out = __lasx_xvsplati_l_h(in, idx)
1400  *          in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
1401  *         idx : 0x02
1402  *         out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
1403  * =============================================================================
1404  */
static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
  /* Permute `in` against itself with selector 0x02, then replicate the
   * halfword at position idx across the vector. */
  __m256i permuted = __lasx_xvpermi_q(in, in, 0x02);

  return __lasx_xvreplve_h(permuted, idx);
}
1412 
1413 /*
1414  * =============================================================================
1415  * Description : Indexed halfword element values are replicated to all
1416  *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
1417  *               if 'idx >= 8' use xvsplati_h_*.
1418  * Arguments   : Inputs - in, idx
1419  *               Output - out
1420  * Details     : Idx element value from in vector is replicated to all
1421  *               elements in out vector.
1422  *               Valid index range for halfword operation is 8-15
1423  * Example     : out = __lasx_xvsplati_h_h(in, idx)
1424  *          in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
1425  *         idx : 0x09
1426  *         out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1427  * =============================================================================
1428  */
static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
  /* Permute `in` against itself with selector 0x13, then replicate the
   * halfword at position idx across the vector. */
  __m256i permuted = __lasx_xvpermi_q(in, in, 0x13);

  return __lasx_xvreplve_h(permuted, idx);
}
1436 
1437 /*
1438  * =============================================================================
1439  * Description : Transpose 4x4 block with double-word elements in vectors
1440  * Arguments   : Inputs  - _in0, _in1, _in2, _in3
1441  *               Outputs - _out0, _out1, _out2, _out3
1442  * Example     : LASX_TRANSPOSE4x4_D
1443  *        _in0 : 1,2,3,4
1444  *        _in1 : 1,2,3,4
1445  *        _in2 : 1,2,3,4
1446  *        _in3 : 1,2,3,4
1447  *
1448  *       _out0 : 1,1,1,1
1449  *       _out1 : 2,2,2,2
1450  *       _out2 : 3,3,3,3
1451  *       _out3 : 4,4,4,4
1452  * =============================================================================
1453  */
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3)                                       \
  {                                                                      \
    /* Stage 1: xvilvl_d/xvilvh_d on row pairs (0,1) and (2,3). */       \
    __m256i _tmp0 = __lasx_xvilvl_d(_in1, _in0);                         \
    __m256i _tmp1 = __lasx_xvilvh_d(_in1, _in0);                         \
    __m256i _tmp2 = __lasx_xvilvl_d(_in3, _in2);                         \
    __m256i _tmp3 = __lasx_xvilvh_d(_in3, _in2);                         \
    /* Stage 2: xvpermi_q permutes assemble the output rows. */          \
    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                        \
    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                        \
    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                        \
    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                        \
  }
1467 
1468 /*
1469  * =============================================================================
1470  * Description : Transpose 8x8 block with word elements in vectors
1471  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
1472  *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
1473  *               _out7
1474  * Example     : LASX_TRANSPOSE8x8_W
1475  *        _in0 : 1,2,3,4,5,6,7,8
1476  *        _in1 : 2,2,3,4,5,6,7,8
1477  *        _in2 : 3,2,3,4,5,6,7,8
1478  *        _in3 : 4,2,3,4,5,6,7,8
1479  *        _in4 : 5,2,3,4,5,6,7,8
1480  *        _in5 : 6,2,3,4,5,6,7,8
1481  *        _in6 : 7,2,3,4,5,6,7,8
1482  *        _in7 : 8,2,3,4,5,6,7,8
1483  *
1484  *       _out0 : 1,2,3,4,5,6,7,8
1485  *       _out1 : 2,2,2,2,2,2,2,2
1486  *       _out2 : 3,3,3,3,3,3,3,3
1487  *       _out3 : 4,4,4,4,4,4,4,4
1488  *       _out4 : 5,5,5,5,5,5,5,5
1489  *       _out5 : 6,6,6,6,6,6,6,6
1490  *       _out6 : 7,7,7,7,7,7,7,7
1491  *       _out7 : 8,8,8,8,8,8,8,8
1492  * =============================================================================
1493  */
#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    /* Rows 0-3, xvilvl_w pass: _tmp0_m, _tmp1_m. */                         \
    __m256i _s0_m = __lasx_xvilvl_w(_in2, _in0);                             \
    __m256i _s1_m = __lasx_xvilvl_w(_in3, _in1);                             \
    __m256i _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                         \
    __m256i _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                         \
    __m256i _tmp2_m, _tmp3_m, _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;            \
                                                                             \
    /* Rows 0-3, xvilvh_w pass: _tmp2_m, _tmp3_m. */                         \
    _s0_m = __lasx_xvilvh_w(_in2, _in0);                                     \
    _s1_m = __lasx_xvilvh_w(_in3, _in1);                                     \
    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
                                                                             \
    /* Rows 4-7, xvilvl_w pass: _tmp4_m, _tmp5_m. */                         \
    _s0_m = __lasx_xvilvl_w(_in6, _in4);                                     \
    _s1_m = __lasx_xvilvl_w(_in7, _in5);                                     \
    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
                                                                             \
    /* Rows 4-7, xvilvh_w pass: _tmp6_m, _tmp7_m. */                         \
    _s0_m = __lasx_xvilvh_w(_in6, _in4);                                     \
    _s1_m = __lasx_xvilvh_w(_in7, _in5);                                     \
    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
                                                                             \
    /* xvpermi_q 0x20 / 0x31 merge the two row groups per output. */         \
    _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                        \
    _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                        \
    _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                        \
    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                        \
    _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                        \
    _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                        \
    _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                        \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                        \
  }
1527 
1528 /*
1529  * =============================================================================
1530  * Description : Transpose input 16x8 byte block
1531  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
1532  *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
1533  *                         (input 16x8 byte block)
1534  *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
1535  *                         _out7 (output 8x16 byte block)
1536  * Details     : The rows of the matrix become columns, and the columns become
1537  *               rows.
1538  * Example     : See LASX_TRANSPOSE16x8_H
1539  * =============================================================================
1540  */
#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7)                                    \
  {                                                                           \
    /* Stage 1: byte interleave of row pairs (xvilvl_b). */                   \
    __m256i _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                            \
    __m256i _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                            \
    __m256i _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                            \
    __m256i _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                            \
    __m256i _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                           \
    __m256i _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                           \
    __m256i _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                          \
    __m256i _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                          \
    /* Stage 2: second byte interleave; outputs serve as scratch. */          \
    _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                                \
    _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                                \
    _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                                \
    _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                                \
    _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                                \
    _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                                \
    _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                                \
    _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                                \
    /* Stage 3: word interleave (xvilvl_w/xvilvh_w). */                       \
    _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                                  \
    _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                                  \
    _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                                  \
    _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                                  \
    _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                                  \
    _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                                  \
    _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                                  \
    _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                                  \
    /* Stage 4: double-word interleave yields the final rows. */              \
    _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                                \
    _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                                \
    _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                                \
    _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                                \
    _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                                \
    _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                                \
    _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                                \
    _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                                \
  }
1582 
1583 /*
1584  * =============================================================================
1585  * Description : Transpose input 16x8 halfword block
1586  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
1587  *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
1588  *                         (input 16x8 halfword block)
1589  *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
1590  *                         _out7 (output 8x16 halfword block)
1591  * Details     : The rows of the matrix become columns, and the columns become
1592  *               rows.
1593  * Example     : LASX_TRANSPOSE16x8_H
1594  *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1595  *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1596  *        _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1597  *        _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1598  *        _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1599  *        _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1600  *        _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1601  *        _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1602  *        _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1603  *        _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1604  *       _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1605  *       _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1606  *       _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1607  *       _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1608  *       _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1609  *       _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1610  *
1611  *       _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
1612  *       _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
1613  *       _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
1614  *       _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
1615  *       _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
1616  *       _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
1617  *       _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
1618  *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
1619  * =============================================================================
1620  */
#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7)                                    \
  {                                                                           \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                           \
    __m256i _o0_m, _o1_m, _o2_m, _o3_m;                                       \
                                                                              \
    /* Pass 1: xvilvl_h interleaves of the inputs produce outputs 0-3. */     \
    /* They are parked in _o*_m: writing _outN here would clobber any */      \
    /* input that the caller also passes as that output (in-place use) */     \
    /* before pass 2 has re-read the inputs. */                               \
    _tmp0_m = __lasx_xvilvl_h(_in2, _in0);                                    \
    _tmp1_m = __lasx_xvilvl_h(_in3, _in1);                                    \
    _tmp2_m = __lasx_xvilvl_h(_in6, _in4);                                    \
    _tmp3_m = __lasx_xvilvl_h(_in7, _in5);                                    \
    _tmp4_m = __lasx_xvilvl_h(_in10, _in8);                                   \
    _tmp5_m = __lasx_xvilvl_h(_in11, _in9);                                   \
    _tmp6_m = __lasx_xvilvl_h(_in14, _in12);                                  \
    _tmp7_m = __lasx_xvilvl_h(_in15, _in13);                                  \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
    _o0_m = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
    _o1_m = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
    _o2_m = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
    _o3_m = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
                                                                              \
    /* Pass 2: same sequence on the xvilvh_h interleaves: outputs 4-7. */     \
    _tmp0_m = __lasx_xvilvh_h(_in2, _in0);                                    \
    _tmp1_m = __lasx_xvilvh_h(_in3, _in1);                                    \
    _tmp2_m = __lasx_xvilvh_h(_in6, _in4);                                    \
    _tmp3_m = __lasx_xvilvh_h(_in7, _in5);                                    \
    _tmp4_m = __lasx_xvilvh_h(_in10, _in8);                                   \
    _tmp5_m = __lasx_xvilvh_h(_in11, _in9);                                   \
    _tmp6_m = __lasx_xvilvh_h(_in14, _in12);                                  \
    _tmp7_m = __lasx_xvilvh_h(_in15, _in13);                                  \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
    /* All inputs consumed: now safe to publish outputs 0-3. */               \
    _out0 = _o0_m;                                                            \
    _out1 = _o1_m;                                                            \
    _out2 = _o2_m;                                                            \
    _out3 = _o3_m;                                                            \
    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
  }
1688 
1689 /*
1690  * =============================================================================
1691  * Description : Transpose 4x4 block with halfword elements in vectors
1692  * Arguments   : Inputs  - _in0, _in1, _in2, _in3
1693  *               Outputs - _out0, _out1, _out2, _out3
1694  *               Return Type - signed halfword
1695  * Details     : The rows of the matrix become columns, and the columns become
1696  *               rows.
1697  * Example     : See LASX_TRANSPOSE8x8_H
1698  * =============================================================================
1699  */
#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3)                                       \
  {                                                                      \
    /* Halfword interleave of row pairs (0,1) and (2,3). */              \
    __m256i _s0_m = __lasx_xvilvl_h(_in1, _in0);                         \
    __m256i _s1_m = __lasx_xvilvl_h(_in3, _in2);                         \
    /* Word interleave gives _out0/_out2; _out1/_out3 are taken from */  \
    /* their high double-words (xvilvh_d of a vector with itself). */    \
    _out0 = __lasx_xvilvl_w(_s1_m, _s0_m);                               \
    _out2 = __lasx_xvilvh_w(_s1_m, _s0_m);                               \
    _out1 = __lasx_xvilvh_d(_out0, _out0);                               \
    _out3 = __lasx_xvilvh_d(_out2, _out2);                               \
  }
1712 
1713 /*
1714  * =============================================================================
1715  * Description : Transpose input 8x8 byte block
1716  * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
1717  *                         (input 8x8 byte block)
1718  *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
1719  *                         _out7 (output 8x8 byte block)
1720  * Example     : See LASX_TRANSPOSE8x8_H
1721  * =============================================================================
1722  */
#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    /* Two rounds of byte interleaving (xvilvl_b / xvilvh_b). */             \
    __m256i _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                           \
    __m256i _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                           \
    __m256i _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                           \
    __m256i _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                           \
    __m256i _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                     \
    __m256i _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                     \
    __m256i _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                     \
    __m256i _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                     \
    /* Word interleave: even-numbered outputs. */                            \
    _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m);                               \
    _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m);                               \
    _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m);                               \
    _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m);                               \
    /* Odd-numbered outputs: the even ones shifted right by 8 bytes. */      \
    _out1 = __lasx_xvbsrl_v(_out0, 8);                                       \
    _out3 = __lasx_xvbsrl_v(_out2, 8);                                       \
    _out5 = __lasx_xvbsrl_v(_out4, 8);                                       \
    _out7 = __lasx_xvbsrl_v(_out6, 8);                                       \
  }
1746 
/*
 * =============================================================================
 * Description : Transpose an 8x8 block of halfword elements held in eight
 *               vectors.
 * Arguments   : Inputs  - _in0 ~ _in7   (rows of the source block)
 *               Outputs - _out0 ~ _out7 (rows of the transposed block)
 * Details     : Row i of the output holds column i of the input. As the
 *               example shows, every vector carries two groups of eight
 *               halfwords and each group is transposed in the same way.
 *               Every input is read into a local before the first output is
 *               stored.
 * Example     : LASX_TRANSPOSE8x8_H
 *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *
 *       _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
 *       _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
 *       _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
 *       _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
 *       _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
 *       _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
 *       _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
 *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
  {                                                                          \
    /* Rows 4..7: pair rows 4/6 and 5/7, then interleave the two pairs. */   \
    __m256i _lo46_m = __lasx_xvilvl_h(_in6, _in4);                           \
    __m256i _lo57_m = __lasx_xvilvl_h(_in7, _in5);                           \
    __m256i _bot0_m = __lasx_xvilvl_h(_lo57_m, _lo46_m);                     \
    __m256i _bot1_m = __lasx_xvilvh_h(_lo57_m, _lo46_m);                     \
    __m256i _hi46_m = __lasx_xvilvh_h(_in6, _in4);                           \
    __m256i _hi57_m = __lasx_xvilvh_h(_in7, _in5);                           \
    __m256i _bot2_m = __lasx_xvilvl_h(_hi57_m, _hi46_m);                     \
    __m256i _bot3_m = __lasx_xvilvh_h(_hi57_m, _hi46_m);                     \
                                                                             \
    /* Rows 0..3: same scheme with rows 0/2 and 1/3. */                      \
    __m256i _lo02_m = __lasx_xvilvl_h(_in2, _in0);                           \
    __m256i _lo13_m = __lasx_xvilvl_h(_in3, _in1);                           \
    __m256i _top0_m = __lasx_xvilvl_h(_lo13_m, _lo02_m);                     \
    __m256i _top1_m = __lasx_xvilvh_h(_lo13_m, _lo02_m);                     \
    __m256i _hi02_m = __lasx_xvilvh_h(_in2, _in0);                           \
    __m256i _hi13_m = __lasx_xvilvh_h(_in3, _in1);                           \
    __m256i _top2_m = __lasx_xvilvl_h(_hi13_m, _hi02_m);                     \
    __m256i _top3_m = __lasx_xvilvh_h(_hi13_m, _hi02_m);                     \
                                                                             \
    /* Merge the rows-0..3 and rows-4..7 partials into full output rows. */  \
    _out0 = __lasx_xvpickev_d(_bot0_m, _top0_m);                             \
    _out2 = __lasx_xvpickev_d(_bot1_m, _top1_m);                             \
    _out4 = __lasx_xvpickev_d(_bot2_m, _top2_m);                             \
    _out6 = __lasx_xvpickev_d(_bot3_m, _top3_m);                             \
    _out1 = __lasx_xvpickod_d(_bot0_m, _top0_m);                             \
    _out3 = __lasx_xvpickod_d(_bot1_m, _top1_m);                             \
    _out5 = __lasx_xvpickod_d(_bot2_m, _top2_m);                             \
    _out7 = __lasx_xvpickod_d(_bot3_m, _top3_m);                             \
  }
1809 
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation. The _B/_H/_W/_D suffix selects which
 *               __lasx_xvadd_* / __lasx_xvsub_* pair is applied.
 *               All four results are computed before any output is stored,
 *               so an output may be the same variable as an input (in-place
 *               use gives the results listed below).
 * Example     : LASX_BUTTERFLY_4
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 * =============================================================================
 */
/* Internal helper: _ADD/_SUB are the element-width specific intrinsics. */
#define LASX_BUTTERFLY_4_IMPL_(_ADD, _SUB, _in0, _in1, _in2, _in3, _out0, \
                               _out1, _out2, _out3)                       \
  {                                                                       \
    __m256i _bf4_sum03_m = _ADD(_in0, _in3);                              \
    __m256i _bf4_sum12_m = _ADD(_in1, _in2);                              \
    __m256i _bf4_dif12_m = _SUB(_in1, _in2);                              \
    __m256i _bf4_dif03_m = _SUB(_in0, _in3);                              \
    /* Store only after every input has been consumed. */                 \
    _out0 = _bf4_sum03_m;                                                 \
    _out1 = _bf4_sum12_m;                                                 \
    _out2 = _bf4_dif12_m;                                                 \
    _out3 = _bf4_dif03_m;                                                 \
  }
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LASX_BUTTERFLY_4_IMPL_(__lasx_xvadd_b, __lasx_xvsub_b, _in0, _in1, _in2,     \
                         _in3, _out0, _out1, _out2, _out3)
#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LASX_BUTTERFLY_4_IMPL_(__lasx_xvadd_h, __lasx_xvsub_h, _in0, _in1, _in2,     \
                         _in3, _out0, _out1, _out2, _out3)
#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LASX_BUTTERFLY_4_IMPL_(__lasx_xvadd_w, __lasx_xvsub_w, _in0, _in1, _in2,     \
                         _in3, _out0, _out1, _out2, _out3)
#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
  LASX_BUTTERFLY_4_IMPL_(__lasx_xvadd_d, __lasx_xvsub_d, _in0, _in1, _in2,     \
                         _in3, _out0, _out1, _out2, _out3)
1851 
/*
 * =============================================================================
 * Description : Butterfly of 8 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
 *               Outputs - _out0, _out1, _out2, _out3, ~
 * Details     : Butterfly operation. The _B/_H/_W/_D suffix selects which
 *               __lasx_xvadd_* / __lasx_xvsub_* pair is applied.
 *               All eight results are computed before any output is stored,
 *               so an output may be the same variable as an input (in-place
 *               use gives the results listed below).
 * Example     : LASX_BUTTERFLY_8
 *               _out0 = _in0 + _in7;
 *               _out1 = _in1 + _in6;
 *               _out2 = _in2 + _in5;
 *               _out3 = _in3 + _in4;
 *               _out4 = _in3 - _in4;
 *               _out5 = _in2 - _in5;
 *               _out6 = _in1 - _in6;
 *               _out7 = _in0 - _in7;
 * =============================================================================
 */
/* Internal helper: _ADD/_SUB are the element-width specific intrinsics. */
#define LASX_BUTTERFLY_8_IMPL_(_ADD, _SUB, _in0, _in1, _in2, _in3, _in4,    \
                               _in5, _in6, _in7, _out0, _out1, _out2,       \
                               _out3, _out4, _out5, _out6, _out7)           \
  {                                                                         \
    __m256i _bf8_sum07_m = _ADD(_in0, _in7);                                \
    __m256i _bf8_sum16_m = _ADD(_in1, _in6);                                \
    __m256i _bf8_sum25_m = _ADD(_in2, _in5);                                \
    __m256i _bf8_sum34_m = _ADD(_in3, _in4);                                \
    __m256i _bf8_dif34_m = _SUB(_in3, _in4);                                \
    __m256i _bf8_dif25_m = _SUB(_in2, _in5);                                \
    __m256i _bf8_dif16_m = _SUB(_in1, _in6);                                \
    __m256i _bf8_dif07_m = _SUB(_in0, _in7);                                \
    /* Store only after every input has been consumed. */                   \
    _out0 = _bf8_sum07_m;                                                   \
    _out1 = _bf8_sum16_m;                                                   \
    _out2 = _bf8_sum25_m;                                                   \
    _out3 = _bf8_sum34_m;                                                   \
    _out4 = _bf8_dif34_m;                                                   \
    _out5 = _bf8_dif25_m;                                                   \
    _out6 = _bf8_dif16_m;                                                   \
    _out7 = _bf8_dif07_m;                                                   \
  }

#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  LASX_BUTTERFLY_8_IMPL_(__lasx_xvadd_b, __lasx_xvsub_b, _in0, _in1, _in2,  \
                         _in3, _in4, _in5, _in6, _in7, _out0, _out1, _out2, \
                         _out3, _out4, _out5, _out6, _out7)

#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  LASX_BUTTERFLY_8_IMPL_(__lasx_xvadd_h, __lasx_xvsub_h, _in0, _in1, _in2,  \
                         _in3, _in4, _in5, _in6, _in7, _out0, _out1, _out2, \
                         _out3, _out4, _out5, _out6, _out7)

#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  LASX_BUTTERFLY_8_IMPL_(__lasx_xvadd_w, __lasx_xvsub_w, _in0, _in1, _in2,  \
                         _in3, _in4, _in5, _in6, _in7, _out0, _out1, _out2, \
                         _out3, _out4, _out5, _out6, _out7)

#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
  LASX_BUTTERFLY_8_IMPL_(__lasx_xvadd_d, __lasx_xvsub_d, _in0, _in1, _in2,  \
                         _in3, _in4, _in5, _in6, _in7, _out0, _out1, _out2, \
                         _out3, _out4, _out5, _out6, _out7)
1924 
1925 #endif  // LASX
1926 
/*
 * =============================================================================
 * Description : Print out elements in vector.
 * Arguments   : Inputs  - RTYPE, element_num, in0, enter
 *               Outputs -
 * Details     : Print out the first 'element_num' elements of 'in0', viewed as
 *               an 'RTYPE' vector, each followed by ','. If 'enter' is TRUE,
 *               prefix "\nVP:" will be added first.
 *               Each element is converted to long long and printed with
 *               "%lld", so 64-bit integer elements are printed in full rather
 *               than being passed to a mismatched "%d".
 *               'in0' and 'element_num' may be arbitrary expressions; they are
 *               parenthesized inside the macro.
 *               printf must be declared where the macro is used (<stdio.h>).
 * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
 *               VP:1,2,3,4,
 * =============================================================================
 */
#define VECT_PRINT(RTYPE, element_num, in0, enter)                  \
  {                                                                 \
    RTYPE _vp_vec_m = (RTYPE)(in0);                                 \
    int _vp_idx_m;                                                  \
    if (enter) {                                                    \
      printf("\nVP:");                                              \
    }                                                               \
    for (_vp_idx_m = 0; _vp_idx_m < (element_num); _vp_idx_m++) {   \
      printf("%lld,", (long long)_vp_vec_m[_vp_idx_m]);             \
    }                                                               \
  }
1947 
1948 #endif /* LOONGSON_INTRINSICS_H */
1949 #endif /* INCLUDE_LIBYUV_LOONGSON_INTRINSICS_H */
1950