/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * All rights reserved.
 * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
 *                Xiwei Gu   <guxiwei-hf@loongson.cn>
 *                Lu Wang    <wanglu@loongson.cn>
 *
 * This file is a header file for LoongArch builtin extensions.
 *
 */
11
12 #ifndef LOONGSON_INTRINSICS_H
13 #define LOONGSON_INTRINSICS_H
14
/**
 * Version of this header, split into three numeric components.
 *
 * MAJOR version: Macro usage changes.
 * MINOR version: Add new functions, or bug fix.
 * MICRO version: Comment changes or implementation changes.
 */
#define LSOM_VERSION_MAJOR 1
#define LSOM_VERSION_MINOR 0
#define LSOM_VERSION_MICRO 3
23
/*
 * DUP2_ARG1 - apply a one-operand operation to two independent inputs.
 *
 *   _INS         : function or function-like macro taking one argument
 *   _IN0, _IN1   : operand of the first / second invocation
 *   _OUT0, _OUT1 : lvalues receiving the first / second result
 *
 * The body is wrapped in do { } while (0) so that "DUP2_ARG1(...);" forms a
 * single statement and can be used as the body of an unbraced if/else.
 */
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
    do {                                          \
        _OUT0 = _INS(_IN0);                       \
        _OUT1 = _INS(_IN1);                       \
    } while (0)
29
/*
 * DUP2_ARG2 - apply a two-operand operation to two independent input pairs.
 *
 *   _INS         : function or function-like macro taking two arguments
 *   _IN0, _IN1   : operands of the first invocation
 *   _IN2, _IN3   : operands of the second invocation
 *   _OUT0, _OUT1 : lvalues receiving the first / second result
 *
 * The body is wrapped in do { } while (0) so that "DUP2_ARG2(...);" forms a
 * single statement and can be used as the body of an unbraced if/else.
 */
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
    do {                                                      \
        _OUT0 = _INS(_IN0, _IN1);                             \
        _OUT1 = _INS(_IN2, _IN3);                             \
    } while (0)
35
/*
 * DUP2_ARG3 - apply a three-operand operation to two independent input
 *             triples.
 *
 *   _INS             : function or function-like macro taking three arguments
 *   _IN0, _IN1, _IN2 : operands of the first invocation
 *   _IN3, _IN4, _IN5 : operands of the second invocation
 *   _OUT0, _OUT1     : lvalues receiving the first / second result
 *
 * The body is wrapped in do { } while (0) so that "DUP2_ARG3(...);" forms a
 * single statement and can be used as the body of an unbraced if/else.
 */
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
    do {                                                                  \
        _OUT0 = _INS(_IN0, _IN1, _IN2);                                   \
        _OUT1 = _INS(_IN3, _IN4, _IN5);                                   \
    } while (0)
41
/*
 * DUP4_ARG1 - apply a one-operand operation to four independent inputs:
 *             _OUTn = _INS(_INn) for n = 0..3, evaluated in that order.
 *
 * Expanded inline (rather than through DUP2_ARG1) so this macro does not
 * depend on the statement shape of another macro, and wrapped in
 * do { } while (0) so that "DUP4_ARG1(...);" forms a single statement that is
 * safe as the body of an unbraced if/else.
 */
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
    do {                                                                    \
        _OUT0 = _INS(_IN0);                                                 \
        _OUT1 = _INS(_IN1);                                                 \
        _OUT2 = _INS(_IN2);                                                 \
        _OUT3 = _INS(_IN3);                                                 \
    } while (0)
47
/*
 * DUP4_ARG2 - apply a two-operand operation to four independent input pairs:
 *             _OUT0 = _INS(_IN0, _IN1), _OUT1 = _INS(_IN2, _IN3),
 *             _OUT2 = _INS(_IN4, _IN5), _OUT3 = _INS(_IN6, _IN7),
 *             evaluated in that order.
 *
 * Expanded inline (rather than through DUP2_ARG2) so this macro does not
 * depend on the statement shape of another macro, and wrapped in
 * do { } while (0) so that "DUP4_ARG2(...);" forms a single statement that is
 * safe as the body of an unbraced if/else.
 */
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
                  _OUT0, _OUT1, _OUT2, _OUT3)                           \
    do {                                                                \
        _OUT0 = _INS(_IN0, _IN1);                                       \
        _OUT1 = _INS(_IN2, _IN3);                                       \
        _OUT2 = _INS(_IN4, _IN5);                                       \
        _OUT3 = _INS(_IN6, _IN7);                                       \
    } while (0)
54
/*
 * DUP4_ARG3 - apply a three-operand operation to four independent input
 *             triples:
 *             _OUT0 = _INS(_IN0, _IN1, _IN2),  _OUT1 = _INS(_IN3, _IN4, _IN5),
 *             _OUT2 = _INS(_IN6, _IN7, _IN8),  _OUT3 = _INS(_IN9, _IN10, _IN11),
 *             evaluated in that order.
 *
 * Expanded inline (rather than through DUP2_ARG3) so this macro does not
 * depend on the statement shape of another macro, and wrapped in
 * do { } while (0) so that "DUP4_ARG3(...);" forms a single statement that is
 * safe as the body of an unbraced if/else.
 */
#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7,   \
                  _IN8, _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)   \
    do {                                                                  \
        _OUT0 = _INS(_IN0, _IN1, _IN2);                                   \
        _OUT1 = _INS(_IN3, _IN4, _IN5);                                   \
        _OUT2 = _INS(_IN6, _IN7, _IN8);                                   \
        _OUT3 = _INS(_IN9, _IN10, _IN11);                                 \
    } while (0)
61
62 #ifdef __loongarch_sx
63 #include <lsxintrin.h>
64 /*
65 * =============================================================================
66 * Description : Dot product & addition of byte vector elements
67 * Arguments : Inputs - in_c, in_h, in_l
68 * Outputs - out
69 * Retrun Type - halfword
70 * Details : Signed byte elements from in_h are multiplied by
71 * signed byte elements from in_l, and then added adjacent to
72 * each other to get results with the twice size of input.
73 * Then the results plus to signed half word elements from in_c.
74 * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
75 * in_c : 1,2,3,4, 1,2,3,4
76 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
77 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
78 * out : 23,40,41,26, 23,40,41,26
79 * =============================================================================
80 */
__lsx_vdp2add_h_b(__m128i in_c,__m128i in_h,__m128i in_l)81 static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l)
82 {
83 __m128i out;
84
85 out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
86 out = __lsx_vmaddwod_h_b(out, in_h, in_l);
87 return out;
88 }
89
90 /*
91 * =============================================================================
92 * Description : Dot product & addition of byte vector elements
93 * Arguments : Inputs - in_c, in_h, in_l
94 * Outputs - out
95 * Retrun Type - halfword
96 * Details : Unsigned byte elements from in_h are multiplied by
97 * unsigned byte elements from in_l, and then added adjacent to
98 * each other to get results with the twice size of input.
99 * The results plus to signed half word elements from in_c.
100 * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
101 * in_c : 1,2,3,4, 1,2,3,4
102 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
103 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
104 * out : 23,40,41,26, 23,40,41,26
105 * =============================================================================
106 */
__lsx_vdp2add_h_bu(__m128i in_c,__m128i in_h,__m128i in_l)107 static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_l)
108 {
109 __m128i out;
110
111 out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
112 out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
113 return out;
114 }
115
116 /*
117 * =============================================================================
118 * Description : Dot product & addition of half word vector elements
119 * Arguments : Inputs - in_c, in_h, in_l
120 * Outputs - out
121 * Retrun Type - __m128i
122 * Details : Signed half word elements from in_h are multiplied by
123 * signed half word elements from in_l, and then added adjacent to
124 * each other to get results with the twice size of input.
125 * Then the results plus to signed word elements from in_c.
126 * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
127 * in_c : 1,2,3,4
128 * in_h : 1,2,3,4, 5,6,7,8
129 * in_l : 8,7,6,5, 4,3,2,1
130 * out : 23,40,41,26
131 * =============================================================================
132 */
__lsx_vdp2add_w_h(__m128i in_c,__m128i in_h,__m128i in_l)133 static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l)
134 {
135 __m128i out;
136
137 out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
138 out = __lsx_vmaddwod_w_h(out, in_h, in_l);
139 return out;
140 }
141
142 /*
143 * =============================================================================
144 * Description : Dot product of byte vector elements
145 * Arguments : Inputs - in_h, in_l
146 * Outputs - out
147 * Retrun Type - halfword
148 * Details : Signed byte elements from in_h are multiplied by
149 * signed byte elements from in_l, and then added adjacent to
150 * each other to get results with the twice size of input.
151 * Example : out = __lsx_vdp2_h_b(in_h, in_l)
152 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
153 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
154 * out : 22,38,38,22, 22,38,38,22
155 * =============================================================================
156 */
__lsx_vdp2_h_b(__m128i in_h,__m128i in_l)157 static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
158 {
159 __m128i out;
160
161 out = __lsx_vmulwev_h_b(in_h, in_l);
162 out = __lsx_vmaddwod_h_b(out, in_h, in_l);
163 return out;
164 }
165
166 /*
167 * =============================================================================
168 * Description : Dot product of byte vector elements
169 * Arguments : Inputs - in_h, in_l
170 * Outputs - out
171 * Retrun Type - halfword
172 * Details : Unsigned byte elements from in_h are multiplied by
173 * unsigned byte elements from in_l, and then added adjacent to
174 * each other to get results with the twice size of input.
175 * Example : out = __lsx_vdp2_h_bu(in_h, in_l)
176 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
177 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
178 * out : 22,38,38,22, 22,38,38,22
179 * =============================================================================
180 */
__lsx_vdp2_h_bu(__m128i in_h,__m128i in_l)181 static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
182 {
183 __m128i out;
184
185 out = __lsx_vmulwev_h_bu(in_h, in_l);
186 out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
187 return out;
188 }
189
190 /*
191 * =============================================================================
192 * Description : Dot product of byte vector elements
193 * Arguments : Inputs - in_h, in_l
194 * Outputs - out
195 * Retrun Type - halfword
196 * Details : Unsigned byte elements from in_h are multiplied by
197 * signed byte elements from in_l, and then added adjacent to
198 * each other to get results with the twice size of input.
199 * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l)
200 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
201 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
202 * out : 22,38,38,22, 22,38,38,6
203 * =============================================================================
204 */
__lsx_vdp2_h_bu_b(__m128i in_h,__m128i in_l)205 static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
206 {
207 __m128i out;
208
209 out = __lsx_vmulwev_h_bu_b(in_h, in_l);
210 out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
211 return out;
212 }
213
214 /*
215 * =============================================================================
216 * Description : Dot product of byte vector elements
217 * Arguments : Inputs - in_h, in_l
218 * Outputs - out
219 * Retrun Type - halfword
220 * Details : Signed byte elements from in_h are multiplied by
221 * signed byte elements from in_l, and then added adjacent to
222 * each other to get results with the twice size of input.
223 * Example : out = __lsx_vdp2_w_h(in_h, in_l)
224 * in_h : 1,2,3,4, 5,6,7,8
225 * in_l : 8,7,6,5, 4,3,2,1
226 * out : 22,38,38,22
227 * =============================================================================
228 */
__lsx_vdp2_w_h(__m128i in_h,__m128i in_l)229 static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l)
230 {
231 __m128i out;
232
233 out = __lsx_vmulwev_w_h(in_h, in_l);
234 out = __lsx_vmaddwod_w_h(out, in_h, in_l);
235 return out;
236 }
237
238 /*
239 * =============================================================================
240 * Description : Clip all halfword elements of input vector between min & max
241 * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : (_in))
242 * Arguments : Inputs - _in (input vector)
243 * - min (min threshold)
244 * - max (max threshold)
245 * Outputs - out (output vector with clipped elements)
246 * Return Type - signed halfword
247 * Example : out = __lsx_vclip_h(_in)
248 * _in : -8,2,280,249, -8,255,280,249
249 * min : 1,1,1,1, 1,1,1,1
250 * max : 9,9,9,9, 9,9,9,9
251 * out : 1,2,9,9, 1,9,9,9
252 * =============================================================================
253 */
__lsx_vclip_h(__m128i _in,__m128i min,__m128i max)254 static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max)
255 {
256 __m128i out;
257
258 out = __lsx_vmax_h(min, _in);
259 out = __lsx_vmin_h(max, out);
260 return out;
261 }
262
263 /*
264 * =============================================================================
265 * Description : Set each element of vector between 0 and 255
266 * Arguments : Inputs - _in
267 * Outputs - out
268 * Retrun Type - halfword
269 * Details : Signed byte elements from _in are clamped between 0 and 255.
270 * Example : out = __lsx_vclip255_h(_in)
271 * _in : -8,255,280,249, -8,255,280,249
272 * out : 0,255,255,249, 0,255,255,249
273 * =============================================================================
274 */
__lsx_vclip255_h(__m128i _in)275 static inline __m128i __lsx_vclip255_h(__m128i _in)
276 {
277 __m128i out;
278
279 out = __lsx_vmaxi_h(_in, 0);
280 out = __lsx_vsat_hu(out, 7);
281 return out;
282 }
283
284 /*
285 * =============================================================================
286 * Description : Set each element of vector between 0 and 255
287 * Arguments : Inputs - _in
288 * Outputs - out
289 * Retrun Type - word
290 * Details : Signed byte elements from _in are clamped between 0 and 255.
291 * Example : out = __lsx_vclip255_w(_in)
292 * _in : -8,255,280,249
293 * out : 0,255,255,249
294 * =============================================================================
295 */
__lsx_vclip255_w(__m128i _in)296 static inline __m128i __lsx_vclip255_w(__m128i _in)
297 {
298 __m128i out;
299
300 out = __lsx_vmaxi_w(_in, 0);
301 out = __lsx_vsat_wu(out, 7);
302 return out;
303 }
304
/*
 * =============================================================================
 * Description : Swap two variables
 * Arguments   : Inputs  - _in0, _in1
 *               Outputs - _in0, _in1 (in-place)
 * Details     : Exchanges the contents of two __m128i lvalues through a
 *               temporary. Unlike an XOR swap, this stays correct when both
 *               arguments name the same variable (an XOR swap would zero it).
 *               Wrapped in do { } while (0) so "LSX_SWAP(a, b);" is a single
 *               statement.
 * Example     : LSX_SWAP(_in0, _in1)
 *         _in0 : 1,2,3,4
 *         _in1 : 5,6,7,8
 *   _in0(out) : 5,6,7,8
 *   _in1(out) : 1,2,3,4
 * =============================================================================
 */
#define LSX_SWAP(_in0, _in1)            \
    do {                                \
        __m128i _swap_tmp = (_in0);     \
        (_in0) = (_in1);                \
        (_in1) = _swap_tmp;             \
    } while (0)
324
/*
 * =============================================================================
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Rows are first interleaved pairwise at word granularity
 *               (_in0 with _in1, _in2 with _in3), then the intermediate
 *               vectors are interleaved at doubleword granularity. All inputs
 *               are read into temporaries before any output is written.
 *               Wrapped in do { } while (0) so the invocation followed by ';'
 *               is a single statement.
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 * =============================================================================
 */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    do {                                                                       \
        __m128i _t0, _t1, _t2, _t3;                                            \
                                                                               \
        _t0   = __lsx_vilvl_w(_in1, _in0);                                     \
        _t1   = __lsx_vilvh_w(_in1, _in0);                                     \
        _t2   = __lsx_vilvl_w(_in3, _in2);                                     \
        _t3   = __lsx_vilvh_w(_in3, _in2);                                     \
        _out0 = __lsx_vilvl_d(_t2, _t0);                                       \
        _out1 = __lsx_vilvh_d(_t2, _t0);                                       \
        _out2 = __lsx_vilvl_d(_t3, _t1);                                       \
        _out3 = __lsx_vilvh_d(_t3, _t1);                                       \
    } while (0)
351
/*
 * =============================================================================
 * Description : Transpose 8x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Even-numbered outputs come from word interleaves; each
 *               odd-numbered output is then derived from the preceding even
 *               output with a byte shuffle against a zero vector.
 *               NOTE(review): the even outputs are written by a full-width
 *               word interleave, so their upper 8 bytes may not be zero as the
 *               example below suggests - confirm before relying on them.
 *               Internal temporaries use underscore-prefixed names so they do
 *               not capture caller arguments called "zero" or "shuf8".
 *               Wrapped in do { } while (0) so the invocation followed by ';'
 *               is a single statement.
 * Example     : LSX_TRANSPOSE8x8_B
 *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 *       _out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
 *       _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
 *       _out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
 *       _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,         \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
    do {                                                                           \
        __m128i _zero  = {0};                                                      \
        __m128i _shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110};                 \
        __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                            \
                                                                                   \
        _t0   = __lsx_vilvl_b(_in2, _in0);                                         \
        _t1   = __lsx_vilvl_b(_in3, _in1);                                         \
        _t2   = __lsx_vilvl_b(_in6, _in4);                                         \
        _t3   = __lsx_vilvl_b(_in7, _in5);                                         \
        _t4   = __lsx_vilvl_b(_t1, _t0);                                           \
        _t5   = __lsx_vilvh_b(_t1, _t0);                                           \
        _t6   = __lsx_vilvl_b(_t3, _t2);                                           \
        _t7   = __lsx_vilvh_b(_t3, _t2);                                           \
        _out0 = __lsx_vilvl_w(_t6, _t4);                                           \
        _out2 = __lsx_vilvh_w(_t6, _t4);                                           \
        _out4 = __lsx_vilvl_w(_t7, _t5);                                           \
        _out6 = __lsx_vilvh_w(_t7, _t5);                                           \
        _out1 = __lsx_vshuf_b(_zero, _out0, _shuf8);                               \
        _out3 = __lsx_vshuf_b(_zero, _out2, _shuf8);                               \
        _out5 = __lsx_vshuf_b(_zero, _out4, _shuf8);                               \
        _out7 = __lsx_vshuf_b(_zero, _out6, _shuf8);                               \
    } while (0)
402
/*
 * =============================================================================
 * Description : Transpose 8x8 block with half word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 * Details     : Two levels of halfword interleaves build eight intermediate
 *               vectors (_t0.._t3 from rows 4-7, _t4.._t7 from rows 0-3);
 *               even outputs then pick the even doublewords and odd outputs
 *               the odd doublewords of matching pairs. All inputs are consumed
 *               before any output is written.
 *               Wrapped in do { } while (0) so the invocation followed by ';'
 *               is a single statement.
 * Example     :
 *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
 *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
 *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
 *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
 *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
 *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
 *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
 *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,         \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
    do {                                                                           \
        __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                  \
                                                                                   \
        _s0   = __lsx_vilvl_h(_in6, _in4);                                         \
        _s1   = __lsx_vilvl_h(_in7, _in5);                                         \
        _t0   = __lsx_vilvl_h(_s1, _s0);                                           \
        _t1   = __lsx_vilvh_h(_s1, _s0);                                           \
        _s0   = __lsx_vilvh_h(_in6, _in4);                                         \
        _s1   = __lsx_vilvh_h(_in7, _in5);                                         \
        _t2   = __lsx_vilvl_h(_s1, _s0);                                           \
        _t3   = __lsx_vilvh_h(_s1, _s0);                                           \
        _s0   = __lsx_vilvl_h(_in2, _in0);                                         \
        _s1   = __lsx_vilvl_h(_in3, _in1);                                         \
        _t4   = __lsx_vilvl_h(_s1, _s0);                                           \
        _t5   = __lsx_vilvh_h(_s1, _s0);                                           \
        _s0   = __lsx_vilvh_h(_in2, _in0);                                         \
        _s1   = __lsx_vilvh_h(_in3, _in1);                                         \
        _t6   = __lsx_vilvl_h(_s1, _s0);                                           \
        _t7   = __lsx_vilvh_h(_s1, _s0);                                           \
                                                                                   \
        _out0 = __lsx_vpickev_d(_t0, _t4);                                         \
        _out2 = __lsx_vpickev_d(_t1, _t5);                                         \
        _out4 = __lsx_vpickev_d(_t2, _t6);                                         \
        _out6 = __lsx_vpickev_d(_t3, _t7);                                         \
        _out1 = __lsx_vpickod_d(_t0, _t4);                                         \
        _out3 = __lsx_vpickod_d(_t1, _t5);                                         \
        _out5 = __lsx_vpickod_d(_t2, _t6);                                         \
        _out7 = __lsx_vpickod_d(_t3, _t7);                                         \
    } while (0)
451
/*
 * =============================================================================
 * Description : Transpose input 8x4 byte block into 4x8
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x4 byte block)
 *               Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. _out1 and _out3 are derived from _out0 and _out2 after
 *               those have been written, so _out0/_out2 must not be the same
 *               variables as _out1/_out3.
 *               Wrapped in do { } while (0) so the invocation followed by ';'
 *               is a single statement.
 * Example     : LSX_TRANSPOSE8x4_B
 *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3)                     \
    do {                                                                   \
        __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                        \
                                                                           \
        _tmp0_m = __lsx_vpackev_w(_in4, _in0);                             \
        _tmp1_m = __lsx_vpackev_w(_in5, _in1);                             \
        _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                         \
        _tmp0_m = __lsx_vpackev_w(_in6, _in2);                             \
        _tmp1_m = __lsx_vpackev_w(_in7, _in3);                             \
                                                                           \
        _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                         \
        _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                         \
        _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                         \
                                                                           \
        _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                           \
        _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                           \
        _out1 = __lsx_vilvh_d(_out2, _out0);                               \
        _out3 = __lsx_vilvh_d(_out0, _out2);                               \
    } while (0)
495
/*
 * =============================================================================
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _in8,
 *                         _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 * Details     : Byte, word and doubleword interleaves are applied in three
 *               stages through the DUP2_ARG2 / DUP4_ARG2 helper macros. All
 *               inputs are read into temporaries before any output is written.
 *               Wrapped in do { } while (0) so the invocation followed by ';'
 *               is a single statement.
 * Example     :
 *              000,001,002,003,004,005,006,007
 *              008,009,010,011,012,013,014,015
 *              016,017,018,019,020,021,022,023
 *              024,025,026,027,028,029,030,031
 *              032,033,034,035,036,037,038,039
 *              040,041,042,043,044,045,046,047        000,008,...,112,120
 *              048,049,050,051,052,053,054,055        001,009,...,113,121
 *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
 *              064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
 *              072,073,074,075,076,077,078,079        004,012,...,116,124
 *              080,081,082,083,084,085,086,087        005,013,...,117,125
 *              088,089,090,091,092,093,094,095        006,014,...,118,126
 *              096,097,098,099,100,101,102,103        007,015,...,119,127
 *              104,105,106,107,108,109,110,111
 *              112,113,114,115,116,117,118,119
 *              120,121,122,123,124,125,126,127
 * =============================================================================
 */
#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _in8,  \
                            _in9, _in10, _in11, _in12, _in13, _in14, _in15, _out0, \
                            _out1, _out2, _out3, _out4, _out5, _out6, _out7)       \
    do {                                                                           \
        __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;            \
        __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                            \
        DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5,   \
                  _tmp0, _tmp1, _tmp2, _tmp3);                                     \
        DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,    \
                  _in13, _tmp4, _tmp5, _tmp6, _tmp7);                              \
        DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);            \
        DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);            \
        DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);            \
        DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);            \
        DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);                \
        DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);                \
        DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);                \
        DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);                \
        DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);        \
        DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);        \
        DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);        \
        DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);        \
    } while (0)
545
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation. Outputs are written one at a time while
 *               inputs are still being read, so an output must not be the same
 *               variable as an input that is read by a later line.
 *               Wrapped in do { } while (0) so the invocation followed by ';'
 *               is a single statement (safe in an unbraced if/else).
 * Example     :
 *               out0 = in0 + in3;
 *               out1 = in1 + in2;
 *               out2 = in1 - in2;
 *               out3 = in0 - in3;
 * =============================================================================
 */
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    do {                                                                      \
        _out0 = __lsx_vadd_b(_in0, _in3);                                     \
        _out1 = __lsx_vadd_b(_in1, _in2);                                     \
        _out2 = __lsx_vsub_b(_in1, _in2);                                     \
        _out3 = __lsx_vsub_b(_in0, _in3);                                     \
    } while (0)
/*
 * Halfword variant of the 4-input butterfly:
 *   out0 = in0 + in3; out1 = in1 + in2; out2 = in1 - in2; out3 = in0 - in3;
 * An output must not be the same variable as an input read by a later line.
 * Wrapped in do { } while (0) so the invocation followed by ';' is a single
 * statement.
 */
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    do {                                                                      \
        _out0 = __lsx_vadd_h(_in0, _in3);                                     \
        _out1 = __lsx_vadd_h(_in1, _in2);                                     \
        _out2 = __lsx_vsub_h(_in1, _in2);                                     \
        _out3 = __lsx_vsub_h(_in0, _in3);                                     \
    } while (0)
/*
 * Word variant of the 4-input butterfly:
 *   out0 = in0 + in3; out1 = in1 + in2; out2 = in1 - in2; out3 = in0 - in3;
 * An output must not be the same variable as an input read by a later line.
 * Wrapped in do { } while (0) so the invocation followed by ';' is a single
 * statement.
 */
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    do {                                                                      \
        _out0 = __lsx_vadd_w(_in0, _in3);                                     \
        _out1 = __lsx_vadd_w(_in1, _in2);                                     \
        _out2 = __lsx_vsub_w(_in1, _in2);                                     \
        _out3 = __lsx_vsub_w(_in0, _in3);                                     \
    } while (0)
/*
 * Doubleword variant of the 4-input butterfly:
 *   out0 = in0 + in3; out1 = in1 + in2; out2 = in1 - in2; out3 = in0 - in3;
 * An output must not be the same variable as an input read by a later line.
 * Wrapped in do { } while (0) so the invocation followed by ';' is a single
 * statement.
 */
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    do {                                                                      \
        _out0 = __lsx_vadd_d(_in0, _in3);                                     \
        _out1 = __lsx_vadd_d(_in1, _in2);                                     \
        _out2 = __lsx_vsub_d(_in1, _in2);                                     \
        _out3 = __lsx_vsub_d(_in0, _in3);                                     \
    } while (0)
587
/*
 * =============================================================================
 * Description : Butterfly of 8 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 * Details     : Butterfly operation. Outputs are written one at a time while
 *               inputs are still being read, so an output must not be the same
 *               variable as an input that is read by a later line.
 *               Wrapped in do { } while (0) so the invocation followed by ';'
 *               is a single statement (safe in an unbraced if/else).
 * Example     :
 *              _out0 = _in0 + _in7;
 *              _out1 = _in1 + _in6;
 *              _out2 = _in2 + _in5;
 *              _out3 = _in3 + _in4;
 *              _out4 = _in3 - _in4;
 *              _out5 = _in2 - _in5;
 *              _out6 = _in1 - _in6;
 *              _out7 = _in0 - _in7;
 * =============================================================================
 */
#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,         \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
    do {                                                                          \
        _out0 = __lsx_vadd_b(_in0, _in7);                                         \
        _out1 = __lsx_vadd_b(_in1, _in6);                                         \
        _out2 = __lsx_vadd_b(_in2, _in5);                                         \
        _out3 = __lsx_vadd_b(_in3, _in4);                                         \
        _out4 = __lsx_vsub_b(_in3, _in4);                                         \
        _out5 = __lsx_vsub_b(_in2, _in5);                                         \
        _out6 = __lsx_vsub_b(_in1, _in6);                                         \
        _out7 = __lsx_vsub_b(_in0, _in7);                                         \
    } while (0)
617
/*
 * Halfword variant of the 8-input butterfly:
 *   _out0.._out3 = _in0 + _in7, _in1 + _in6, _in2 + _in5, _in3 + _in4
 *   _out4.._out7 = _in3 - _in4, _in2 - _in5, _in1 - _in6, _in0 - _in7
 * An output must not be the same variable as an input read by a later line.
 * Wrapped in do { } while (0) so the invocation followed by ';' is a single
 * statement.
 */
#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,         \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
    do {                                                                          \
        _out0 = __lsx_vadd_h(_in0, _in7);                                         \
        _out1 = __lsx_vadd_h(_in1, _in6);                                         \
        _out2 = __lsx_vadd_h(_in2, _in5);                                         \
        _out3 = __lsx_vadd_h(_in3, _in4);                                         \
        _out4 = __lsx_vsub_h(_in3, _in4);                                         \
        _out5 = __lsx_vsub_h(_in2, _in5);                                         \
        _out6 = __lsx_vsub_h(_in1, _in6);                                         \
        _out7 = __lsx_vsub_h(_in0, _in7);                                         \
    } while (0)
630
/*
 * Word variant of the 8-input butterfly:
 *   _out0.._out3 = _in0 + _in7, _in1 + _in6, _in2 + _in5, _in3 + _in4
 *   _out4.._out7 = _in3 - _in4, _in2 - _in5, _in1 - _in6, _in0 - _in7
 * An output must not be the same variable as an input read by a later line.
 * Wrapped in do { } while (0) so the invocation followed by ';' is a single
 * statement.
 */
#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,         \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
    do {                                                                          \
        _out0 = __lsx_vadd_w(_in0, _in7);                                         \
        _out1 = __lsx_vadd_w(_in1, _in6);                                         \
        _out2 = __lsx_vadd_w(_in2, _in5);                                         \
        _out3 = __lsx_vadd_w(_in3, _in4);                                         \
        _out4 = __lsx_vsub_w(_in3, _in4);                                         \
        _out5 = __lsx_vsub_w(_in2, _in5);                                         \
        _out6 = __lsx_vsub_w(_in1, _in6);                                         \
        _out7 = __lsx_vsub_w(_in0, _in7);                                         \
    } while (0)
643
/*
 * Doubleword variant of the 8-input butterfly:
 *   _out0.._out3 = _in0 + _in7, _in1 + _in6, _in2 + _in5, _in3 + _in4
 *   _out4.._out7 = _in3 - _in4, _in2 - _in5, _in1 - _in6, _in0 - _in7
 * An output must not be the same variable as an input read by a later line.
 * Wrapped in do { } while (0) so the invocation followed by ';' is a single
 * statement.
 */
#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,         \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
    do {                                                                          \
        _out0 = __lsx_vadd_d(_in0, _in7);                                         \
        _out1 = __lsx_vadd_d(_in1, _in6);                                         \
        _out2 = __lsx_vadd_d(_in2, _in5);                                         \
        _out3 = __lsx_vadd_d(_in3, _in4);                                         \
        _out4 = __lsx_vsub_d(_in3, _in4);                                         \
        _out5 = __lsx_vsub_d(_in2, _in5);                                         \
        _out6 = __lsx_vsub_d(_in1, _in6);                                         \
        _out7 = __lsx_vsub_d(_in0, _in7);                                         \
    } while (0)
656
657 #endif //LSX
658
659 #ifdef __loongarch_asx
660 #include <lasxintrin.h>
661 /*
662 * =============================================================================
663 * Description : Dot product of byte vector elements
664 * Arguments : Inputs - in_h, in_l
665 * Output - out
666 * Return Type - signed halfword
667 * Details : Unsigned byte elements from in_h are multiplied with
668 * unsigned byte elements from in_l producing a result
669 * twice the size of input i.e. signed halfword.
670 * Then this multiplied results of adjacent odd-even elements
671 * are added to the out vector
672 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
673 * =============================================================================
674 */
__lasx_xvdp2_h_bu(__m256i in_h,__m256i in_l)675 static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)
676 {
677 __m256i out;
678
679 out = __lasx_xvmulwev_h_bu(in_h, in_l);
680 out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
681 return out;
682 }
683
684 /*
685 * =============================================================================
686 * Description : Dot product of byte vector elements
687 * Arguments : Inputs - in_h, in_l
688 * Output - out
689 * Return Type - signed halfword
690 * Details : Signed byte elements from in_h are multiplied with
691 * signed byte elements from in_l producing a result
692 * twice the size of input i.e. signed halfword.
693 * Then this iniplication results of adjacent odd-even elements
694 * are added to the out vector
695 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
696 * =============================================================================
697 */
__lasx_xvdp2_h_b(__m256i in_h,__m256i in_l)698 static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)
699 {
700 __m256i out;
701
702 out = __lasx_xvmulwev_h_b(in_h, in_l);
703 out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
704 return out;
705 }
706
707 /*
708 * =============================================================================
709 * Description : Dot product of halfword vector elements
710 * Arguments : Inputs - in_h, in_l
711 * Output - out
712 * Return Type - signed word
713 * Details : Signed halfword elements from in_h are multiplied with
714 * signed halfword elements from in_l producing a result
715 * twice the size of input i.e. signed word.
716 * Then this multiplied results of adjacent odd-even elements
717 * are added to the out vector.
718 * Example : out = __lasx_xvdp2_w_h(in_h, in_l)
719 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
720 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
721 * out : 22,38,38,22, 22,38,38,22
722 * =============================================================================
723 */
__lasx_xvdp2_w_h(__m256i in_h,__m256i in_l)724 static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)
725 {
726 __m256i out;
727
728 out = __lasx_xvmulwev_w_h(in_h, in_l);
729 out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
730 return out;
731 }
732
733 /*
734 * =============================================================================
735 * Description : Dot product of word vector elements
736 * Arguments : Inputs - in_h, in_l
737 * Output - out
738 * Retrun Type - signed double
739 * Details : Signed word elements from in_h are multiplied with
740 * signed word elements from in_l producing a result
741 * twice the size of input i.e. signed double word.
742 * Then this multiplied results of adjacent odd-even elements
743 * are added to the out vector.
744 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
745 * =============================================================================
746 */
__lasx_xvdp2_d_w(__m256i in_h,__m256i in_l)747 static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)
748 {
749 __m256i out;
750
751 out = __lasx_xvmulwev_d_w(in_h, in_l);
752 out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
753 return out;
754 }
755
756 /*
757 * =============================================================================
758 * Description : Dot product of halfword vector elements
759 * Arguments : Inputs - in_h, in_l
760 * Output - out
761 * Return Type - signed word
762 * Details : Unsigned halfword elements from in_h are multiplied with
763 * signed halfword elements from in_l producing a result
764 * twice the size of input i.e. unsigned word.
765 * Multiplication result of adjacent odd-even elements
766 * are added to the out vector
767 * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
768 * =============================================================================
769 */
__lasx_xvdp2_w_hu_h(__m256i in_h,__m256i in_l)770 static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
771 {
772 __m256i out;
773
774 out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
775 out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
776 return out;
777 }
778
779 /*
780 * =============================================================================
781 * Description : Dot product & addition of byte vector elements
782 * Arguments : Inputs - in_h, in_l
783 * Output - out
784 * Retrun Type - halfword
785 * Details : Signed byte elements from in_h are multiplied with
786 * signed byte elements from in_l producing a result
787 * twice the size of input i.e. signed halfword.
788 * Then this multiplied results of adjacent odd-even elements
789 * are added to the in_c vector.
790 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
791 * =============================================================================
792 */
__lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h,__m256i in_l)793 static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h, __m256i in_l)
794 {
795 __m256i out;
796
797 out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
798 out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
799 return out;
800 }
801
802 /*
803 * =============================================================================
804 * Description : Dot product of halfword vector elements
805 * Arguments : Inputs - in_c, in_h, in_l
806 * Output - out
807 * Return Type - per RTYPE
808 * Details : Signed halfword elements from in_h are multiplied with
809 * signed halfword elements from in_l producing a result
810 * twice the size of input i.e. signed word.
811 * Multiplication result of adjacent odd-even elements
812 * are added to the in_c vector.
813 * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
814 * in_c : 1,2,3,4, 1,2,3,4
815 * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
816 * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
817 * out : 23,40,41,26, 23,40,41,26
818 * =============================================================================
819 */
__lasx_xvdp2add_w_h(__m256i in_c,__m256i in_h,__m256i in_l)820 static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
821 {
822 __m256i out;
823
824 out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
825 out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
826 return out;
827 }
828
829 /*
830 * =============================================================================
831 * Description : Dot product of halfword vector elements
832 * Arguments : Inputs - in_c, in_h, in_l
833 * Output - out
834 * Return Type - signed word
835 * Details : Unsigned halfword elements from in_h are multiplied with
836 * unsigned halfword elements from in_l producing a result
837 * twice the size of input i.e. signed word.
838 * Multiplication result of adjacent odd-even elements
839 * are added to the in_c vector.
840 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
841 * =============================================================================
842 */
__lasx_xvdp2add_w_hu(__m256i in_c,__m256i in_h,__m256i in_l)843 static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i in_l)
844 {
845 __m256i out;
846
847 out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
848 out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
849 return out;
850 }
851
852 /*
853 * =============================================================================
854 * Description : Dot product of halfword vector elements
855 * Arguments : Inputs - in_c, in_h, in_l
856 * Output - out
857 * Return Type - signed word
858 * Details : Unsigned halfword elements from in_h are multiplied with
859 * signed halfword elements from in_l producing a result
860 * twice the size of input i.e. signed word.
861 * Multiplication result of adjacent odd-even elements
862 * are added to the in_c vector
863 * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
864 * =============================================================================
865 */
__lasx_xvdp2add_w_hu_h(__m256i in_c,__m256i in_h,__m256i in_l)866 static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i in_l)
867 {
868 __m256i out;
869
870 out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
871 out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
872 return out;
873 }
874
875 /*
876 * =============================================================================
877 * Description : Vector Unsigned Dot Product and Subtract
878 * Arguments : Inputs - in_c, in_h, in_l
879 * Output - out
880 * Return Type - signed halfword
881 * Details : Unsigned byte elements from in_h are multiplied with
882 * unsigned byte elements from in_l producing a result
883 * twice the size of input i.e. signed halfword.
884 * Multiplication result of adjacent odd-even elements
885 * are added together and subtracted from double width elements
886 * in_c vector.
887 * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
888 * =============================================================================
889 */
__lasx_xvdp2sub_h_bu(__m256i in_c,__m256i in_h,__m256i in_l)890 static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i in_l)
891 {
892 __m256i out;
893
894 out = __lasx_xvmulwev_h_bu(in_h, in_l);
895 out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
896 out = __lasx_xvsub_h(in_c, out);
897 return out;
898 }
899
900 /*
901 * =============================================================================
902 * Description : Vector Signed Dot Product and Subtract
903 * Arguments : Inputs - in_c, in_h, in_l
904 * Output - out
905 * Return Type - signed word
906 * Details : Signed halfword elements from in_h are multiplied with
907 * Signed halfword elements from in_l producing a result
908 * twice the size of input i.e. signed word.
909 * Multiplication result of adjacent odd-even elements
910 * are added together and subtracted from double width elements
911 * in_c vector.
912 * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
913 * in_c : 0,0,0,0, 0,0,0,0
914 * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
915 * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
916 * out : -7,-3,0,0, 0,-1,0,-1
917 * =============================================================================
918 */
__lasx_xvdp2sub_w_h(__m256i in_c,__m256i in_h,__m256i in_l)919 static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
920 {
921 __m256i out;
922
923 out = __lasx_xvmulwev_w_h(in_h, in_l);
924 out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
925 out = __lasx_xvsub_w(in_c, out);
926 return out;
927 }
928
929 /*
930 * =============================================================================
931 * Description : Dot product of halfword vector elements
932 * Arguments : Inputs - in_h, in_l
933 * Output - out
934 * Return Type - signed word
935 * Details : Signed halfword elements from in_h are iniplied with
936 * signed halfword elements from in_l producing a result
937 * four times the size of input i.e. signed doubleword.
938 * Then this iniplication results of four adjacent elements
939 * are added together and stored to the out vector.
940 * Example : out = __lasx_xvdp4_d_h(in_h, in_l)
941 * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
942 * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
943 * out : -2,0,1,1
944 * =============================================================================
945 */
__lasx_xvdp4_d_h(__m256i in_h,__m256i in_l)946 static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)
947 {
948 __m256i out;
949
950 out = __lasx_xvmulwev_w_h(in_h, in_l);
951 out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
952 out = __lasx_xvhaddw_d_w(out, out);
953 return out;
954 }
955
956 /*
957 * =============================================================================
958 * Description : The high half of the vector elements are expanded and
959 * added after being doubled.
960 * Arguments : Inputs - in_h, in_l
961 * Output - out
962 * Details : The in_h vector and the in_l vector are added after the
963 * higher half of the two-fold sign extension (signed byte
964 * to signed halfword) and stored to the out vector.
965 * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l)
966 * =============================================================================
967 */
__lasx_xvaddwh_h_b(__m256i in_h,__m256i in_l)968 static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
969 {
970 __m256i out;
971
972 out = __lasx_xvilvh_b(in_h, in_l);
973 out = __lasx_xvhaddw_h_b(out, out);
974 return out;
975 }
976
977 /*
978 * =============================================================================
979 * Description : The high half of the vector elements are expanded and
980 * added after being doubled.
981 * Arguments : Inputs - in_h, in_l
982 * Output - out
983 * Details : The in_h vector and the in_l vector are added after the
984 * higher half of the two-fold sign extension (signed halfword
985 * to signed word) and stored to the out vector.
986 * Example : out = __lasx_xvaddwh_w_h(in_h, in_l)
987 * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
988 * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
989 * out : 1,0,0,-1, 1,0,0, 2
990 * =============================================================================
991 */
__lasx_xvaddwh_w_h(__m256i in_h,__m256i in_l)992 static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l)
993 {
994 __m256i out;
995
996 out = __lasx_xvilvh_h(in_h, in_l);
997 out = __lasx_xvhaddw_w_h(out, out);
998 return out;
999 }
1000
1001 /*
1002 * =============================================================================
1003 * Description : The low half of the vector elements are expanded and
1004 * added after being doubled.
1005 * Arguments : Inputs - in_h, in_l
1006 * Output - out
1007 * Details : The in_h vector and the in_l vector are added after the
1008 * lower half of the two-fold sign extension (signed byte
1009 * to signed halfword) and stored to the out vector.
1010 * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1011 * =============================================================================
1012 */
__lasx_xvaddwl_h_b(__m256i in_h,__m256i in_l)1013 static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)
1014 {
1015 __m256i out;
1016
1017 out = __lasx_xvilvl_b(in_h, in_l);
1018 out = __lasx_xvhaddw_h_b(out, out);
1019 return out;
1020 }
1021
1022 /*
1023 * =============================================================================
1024 * Description : The low half of the vector elements are expanded and
1025 * added after being doubled.
1026 * Arguments : Inputs - in_h, in_l
1027 * Output - out
1028 * Details : The in_h vector and the in_l vector are added after the
1029 * lower half of the two-fold sign extension (signed halfword
1030 * to signed word) and stored to the out vector.
1031 * Example : out = __lasx_xvaddwl_w_h(in_h, in_l)
1032 * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1033 * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1034 * out : 5,-1,4,2, 1,0,2,-1
1035 * =============================================================================
1036 */
__lasx_xvaddwl_w_h(__m256i in_h,__m256i in_l)1037 static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)
1038 {
1039 __m256i out;
1040
1041 out = __lasx_xvilvl_h(in_h, in_l);
1042 out = __lasx_xvhaddw_w_h(out, out);
1043 return out;
1044 }
1045
1046 /*
1047 * =============================================================================
1048 * Description : The low half of the vector elements are expanded and
1049 * added after being doubled.
1050 * Arguments : Inputs - in_h, in_l
1051 * Output - out
1052 * Details : The out vector and the out vector are added after the
1053 * lower half of the two-fold zero extension (unsigned byte
1054 * to unsigned halfword) and stored to the out vector.
1055 * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1056 * =============================================================================
1057 */
__lasx_xvaddwl_h_bu(__m256i in_h,__m256i in_l)1058 static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)
1059 {
1060 __m256i out;
1061
1062 out = __lasx_xvilvl_b(in_h, in_l);
1063 out = __lasx_xvhaddw_hu_bu(out, out);
1064 return out;
1065 }
1066
1067 /*
1068 * =============================================================================
1069 * Description : The low half of the vector elements are expanded and
1070 * added after being doubled.
1071 * Arguments : Inputs - in_h, in_l
1072 * Output - out
1073 * Details : The in_l vector after double zero extension (unsigned byte to
1074 * signed halfword),added to the in_h vector.
1075 * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
1076 * =============================================================================
1077 */
__lasx_xvaddw_h_h_bu(__m256i in_h,__m256i in_l)1078 static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)
1079 {
1080 __m256i out;
1081
1082 out = __lasx_xvsllwil_hu_bu(in_l, 0);
1083 out = __lasx_xvadd_h(in_h, out);
1084 return out;
1085 }
1086
1087 /*
1088 * =============================================================================
1089 * Description : The low half of the vector elements are expanded and
1090 * added after being doubled.
1091 * Arguments : Inputs - in_h, in_l
1092 * Output - out
1093 * Details : The in_l vector after double sign extension (signed halfword to
1094 * signed word), added to the in_h vector.
1095 * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l)
1096 * in_h : 0, 1,0,0, -1,0,0,1,
1097 * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1,
1098 * out : 2, 0,1,2, -1,0,1,1,
1099 * =============================================================================
1100 */
__lasx_xvaddw_w_w_h(__m256i in_h,__m256i in_l)1101 static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)
1102 {
1103 __m256i out;
1104
1105 out = __lasx_xvsllwil_w_h(in_l, 0);
1106 out = __lasx_xvadd_w(in_h, out);
1107 return out;
1108 }
1109
1110 /*
1111 * =============================================================================
1112 * Description : Multiplication and addition calculation after expansion
1113 * of the lower half of the vector.
1114 * Arguments : Inputs - in_c, in_h, in_l
1115 * Output - out
1116 * Details : The in_h vector and the in_l vector are multiplied after
1117 * the lower half of the two-fold sign extension (signed halfword
1118 * to signed word), and the result is added to the vector in_c,
1119 * then stored to the out vector.
1120 * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1121 * in_c : 1,2,3,4, 5,6,7,8
1122 * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
1123 * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000,
1124 * -200,-300,-400,-500, -2000,-3000,-4000,-5000
1125 * out : 201, 602,1203,2004, -995, -1794,-2793,-3992
1126 * =============================================================================
1127 */
__lasx_xvmaddwl_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1128 static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
1129 {
1130 __m256i tmp0, tmp1, out;
1131
1132 tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1133 tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1134 tmp0 = __lasx_xvmul_w(tmp0, tmp1);
1135 out = __lasx_xvadd_w(tmp0, in_c);
1136 return out;
1137 }
1138
1139 /*
1140 * =============================================================================
1141 * Description : Multiplication and addition calculation after expansion
1142 * of the higher half of the vector.
1143 * Arguments : Inputs - in_c, in_h, in_l
1144 * Output - out
1145 * Details : The in_h vector and the in_l vector are multiplied after
1146 * the higher half of the two-fold sign extension (signed
1147 * halfword to signed word), and the result is added to
1148 * the vector in_c, then stored to the out vector.
1149 * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1150 * =============================================================================
1151 */
__lasx_xvmaddwh_w_h(__m256i in_c,__m256i in_h,__m256i in_l)1152 static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
1153 {
1154 __m256i tmp0, tmp1, out;
1155
1156 tmp0 = __lasx_xvilvh_h(in_h, in_h);
1157 tmp1 = __lasx_xvilvh_h(in_l, in_l);
1158 tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
1159 out = __lasx_xvadd_w(tmp0, in_c);
1160 return out;
1161 }
1162
1163 /*
1164 * =============================================================================
1165 * Description : Multiplication calculation after expansion of the lower
1166 * half of the vector.
1167 * Arguments : Inputs - in_h, in_l
1168 * Output - out
1169 * Details : The in_h vector and the in_l vector are multiplied after
1170 * the lower half of the two-fold sign extension (signed
1171 * halfword to signed word), then stored to the out vector.
1172 * Example : out = __lasx_xvmulwl_w_h(in_h, in_l)
1173 * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1174 * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1175 * out : 6,1,3,0, 0,0,1,0
1176 * =============================================================================
1177 */
__lasx_xvmulwl_w_h(__m256i in_h,__m256i in_l)1178 static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l)
1179 {
1180 __m256i tmp0, tmp1, out;
1181
1182 tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1183 tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1184 out = __lasx_xvmul_w(tmp0, tmp1);
1185 return out;
1186 }
1187
1188 /*
1189 * =============================================================================
1190 * Description : Multiplication calculation after expansion of the lower
1191 * half of the vector.
1192 * Arguments : Inputs - in_h, in_l
1193 * Output - out
1194 * Details : The in_h vector and the in_l vector are multiplied after
1195 * the lower half of the two-fold sign extension (signed
1196 * halfword to signed word), then stored to the out vector.
1197 * Example : out = __lasx_xvmulwh_w_h(in_h, in_l)
1198 * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1199 * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1200 * out : 0,0,0,0, 0,0,0,1
1201 * =============================================================================
1202 */
__lasx_xvmulwh_w_h(__m256i in_h,__m256i in_l)1203 static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l)
1204 {
1205 __m256i tmp0, tmp1, out;
1206
1207 tmp0 = __lasx_xvilvh_h(in_h, in_h);
1208 tmp1 = __lasx_xvilvh_h(in_l, in_l);
1209 out = __lasx_xvmulwev_w_h(tmp0, tmp1);
1210 return out;
1211 }
1212
1213 /*
1214 * =============================================================================
1215 * Description : The low half of the vector elements are expanded and
1216 * added saturately after being doubled.
1217 * Arguments : Inputs - in_h, in_l
1218 * Output - out
1219 * Details : The in_h vector adds the in_l vector saturately after the lower
1220 * half of the two-fold zero extension (unsigned byte to unsigned
1221 * halfword) and the results are stored to the out vector.
1222 * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
1223 * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
1224 * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
1225 * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
1226 * =============================================================================
1227 */
__lasx_xvsaddw_hu_hu_bu(__m256i in_h,__m256i in_l)1228 static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l)
1229 {
1230 __m256i tmp1, out;
1231 __m256i zero = {0};
1232
1233 tmp1 = __lasx_xvilvl_b(zero, in_l);
1234 out = __lasx_xvsadd_hu(in_h, tmp1);
1235 return out;
1236 }
1237
1238 /*
1239 * =============================================================================
1240 * Description : Clip all halfword elements of input vector between min & max
1241 * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
1242 * Arguments : Inputs - in (input vector)
1243 * - min (min threshold)
1244 * - max (max threshold)
1245 * Outputs - in (output vector with clipped elements)
1246 * Return Type - signed halfword
1247 * Example : out = __lasx_xvclip_h(in, min, max)
1248 * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
1249 * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
1250 * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
1251 * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
1252 * =============================================================================
1253 */
__lasx_xvclip_h(__m256i in,__m256i min,__m256i max)1254 static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max)
1255 {
1256 __m256i out;
1257
1258 out = __lasx_xvmax_h(min, in);
1259 out = __lasx_xvmin_h(max, out);
1260 return out;
1261 }
1262
1263 /*
1264 * =============================================================================
1265 * Description : Clip all signed halfword elements of input vector
1266 * between 0 & 255
1267 * Arguments : Inputs - in (input vector)
1268 * Outputs - out (output vector with clipped elements)
1269 * Return Type - signed halfword
1270 * Example : See out = __lasx_xvclip255_w(in)
1271 * =============================================================================
1272 */
__lasx_xvclip255_h(__m256i in)1273 static inline __m256i __lasx_xvclip255_h(__m256i in)
1274 {
1275 __m256i out;
1276
1277 out = __lasx_xvmaxi_h(in, 0);
1278 out = __lasx_xvsat_hu(out, 7);
1279 return out;
1280 }
1281
1282 /*
1283 * =============================================================================
1284 * Description : Clip all signed word elements of input vector
1285 * between 0 & 255
1286 * Arguments : Inputs - in (input vector)
1287 * Output - out (output vector with clipped elements)
1288 * Return Type - signed word
1289 * Example : out = __lasx_xvclip255_w(in)
1290 * in : -8,255,280,249, -8,255,280,249
1291 * out : 0,255,255,249, 0,255,255,249
1292 * =============================================================================
1293 */
__lasx_xvclip255_w(__m256i in)1294 static inline __m256i __lasx_xvclip255_w(__m256i in)
1295 {
1296 __m256i out;
1297
1298 out = __lasx_xvmaxi_w(in, 0);
1299 out = __lasx_xvsat_wu(out, 7);
1300 return out;
1301 }
1302
1303 /*
1304 * =============================================================================
1305 * Description : Indexed halfword element values are replicated to all
1306 * elements in output vector. If 'indx < 8' use xvsplati_l_*,
1307 * if 'indx >= 8' use xvsplati_h_*.
1308 * Arguments : Inputs - in, idx
1309 * Output - out
1310 * Details : Idx element value from in vector is replicated to all
1311 * elements in out vector.
1312 * Valid index range for halfword operation is 0-7
1313 * Example : out = __lasx_xvsplati_l_h(in, idx)
1314 * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
1315 * idx : 0x02
1316 * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
1317 * =============================================================================
1318 */
__lasx_xvsplati_l_h(__m256i in,int idx)1319 static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx)
1320 {
1321 __m256i out;
1322
1323 out = __lasx_xvpermi_q(in, in, 0x02);
1324 out = __lasx_xvreplve_h(out, idx);
1325 return out;
1326 }
1327
1328 /*
1329 * =============================================================================
1330 * Description : Indexed halfword element values are replicated to all
1331 * elements in output vector. If 'indx < 8' use xvsplati_l_*,
1332 * if 'indx >= 8' use xvsplati_h_*.
1333 * Arguments : Inputs - in, idx
1334 * Output - out
1335 * Details : Idx element value from in vector is replicated to all
1336 * elements in out vector.
1337 * Valid index range for halfword operation is 0-7
1338 * Example : out = __lasx_xvsplati_h_h(in, idx)
1339 * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
1340 * idx : 0x09
1341 * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1342 * =============================================================================
1343 */
__lasx_xvsplati_h_h(__m256i in,int idx)1344 static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
1345 {
1346 __m256i out;
1347
1348 out = __lasx_xvpermi_q(in, in, 0x13);
1349 out = __lasx_xvreplve_h(out, idx);
1350 return out;
1351 }
1352
/*
 * =============================================================================
 * Description : Transpose 4x4 block with double word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : The rows of the matrix become columns, and the columns
 *               become rows.
 *               Every input is read into a temporary before the first
 *               output is assigned. The macro declares _tmp0.._tmp3 in its
 *               own block, so arguments must not use those names.
 * Example     : LASX_TRANSPOSE4x4_D
 *        _in0 : 1,2,3,4
 *        _in1 : 1,2,3,4
 *        _in2 : 1,2,3,4
 *        _in3 : 1,2,3,4
 *
 *       _out0 : 1,1,1,1
 *       _out1 : 2,2,2,2
 *       _out2 : 3,3,3,3
 *       _out3 : 4,4,4,4
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{ \
    __m256i _tmp0, _tmp1, _tmp2, _tmp3; \
    /* Step 1: interleave doublewords of row pairs (0,1) and (2,3). */ \
    _tmp0 = __lasx_xvilvl_d(_in1, _in0); \
    _tmp1 = __lasx_xvilvh_d(_in1, _in0); \
    _tmp2 = __lasx_xvilvl_d(_in3, _in2); \
    _tmp3 = __lasx_xvilvh_d(_in3, _in2); \
    /* Step 2: combine 128-bit halves of the two row pairs. */ \
    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20); \
    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31); \
    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20); \
    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31); \
}
1382
/*
 * =============================================================================
 * Description : Transpose 8x8 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 * Details     : The rows of the matrix become columns, and the columns
 *               become rows.
 *               Every input is read into a temporary before the first
 *               output is assigned. The macro declares _s0_m, _s1_m and
 *               _tmp0_m.._tmp7_m in its own block, so arguments must not
 *               use those names.
 * Example     : LASX_TRANSPOSE8x8_W
 *        _in0 : 1,2,3,4,5,6,7,8
 *        _in1 : 2,2,3,4,5,6,7,8
 *        _in2 : 3,2,3,4,5,6,7,8
 *        _in3 : 4,2,3,4,5,6,7,8
 *        _in4 : 5,2,3,4,5,6,7,8
 *        _in5 : 6,2,3,4,5,6,7,8
 *        _in6 : 7,2,3,4,5,6,7,8
 *        _in7 : 8,2,3,4,5,6,7,8
 *
 *       _out0 : 1,2,3,4,5,6,7,8
 *       _out1 : 2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    __m256i _s0_m, _s1_m; \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
    \
    /* Rows 0-3: two rounds of word interleaving -> _tmp0_m.._tmp3_m. */ \
    _s0_m = __lasx_xvilvl_w(_in2, _in0); \
    _s1_m = __lasx_xvilvl_w(_in3, _in1); \
    _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
    _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
    _s0_m = __lasx_xvilvh_w(_in2, _in0); \
    _s1_m = __lasx_xvilvh_w(_in3, _in1); \
    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
    /* Rows 4-7: same pattern -> _tmp4_m.._tmp7_m. */ \
    _s0_m = __lasx_xvilvl_w(_in6, _in4); \
    _s1_m = __lasx_xvilvl_w(_in7, _in5); \
    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
    _s0_m = __lasx_xvilvh_w(_in6, _in4); \
    _s1_m = __lasx_xvilvh_w(_in7, _in5); \
    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m); \
    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m); \
    /* Combine 128-bit halves of the rows 0-3 and rows 4-7 results. */ \
    _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20); \
    _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20); \
    _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20); \
    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20); \
    _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31); \
    _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31); \
    _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31); \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31); \
}
1440
/*
 * =============================================================================
 * Description : Transpose input 16x8 byte block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 *                         (output 8x16 byte block)
 * Details     : The rows of the matrix become columns, and the columns become rows.
 *               Every input is read before the first output is assigned.
 *               _out0.._out7 are also used as scratch storage between the
 *               interleaving stages, so they must be eight distinct
 *               variables. The macro declares _tmp0_m.._tmp7_m in its own
 *               block, so arguments must not use those names.
 * Example     : See LASX_TRANSPOSE16x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15, \
                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
    \
    /* Stage 1: byte-interleave the input rows. */ \
    _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
    _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
    _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
    _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
    _tmp4_m = __lasx_xvilvl_b(_in10, _in8); \
    _tmp5_m = __lasx_xvilvl_b(_in11, _in9); \
    _tmp6_m = __lasx_xvilvl_b(_in14, _in12); \
    _tmp7_m = __lasx_xvilvl_b(_in15, _in13); \
    /* Stage 2: second byte interleave; outputs serve as scratch here. */ \
    _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
    _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
    _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
    _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
    _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m); \
    _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m); \
    _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m); \
    _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m); \
    /* Stage 3: word interleave of the stage-2 results. */ \
    _tmp0_m = __lasx_xvilvl_w(_out2, _out0); \
    _tmp2_m = __lasx_xvilvh_w(_out2, _out0); \
    _tmp4_m = __lasx_xvilvl_w(_out3, _out1); \
    _tmp6_m = __lasx_xvilvh_w(_out3, _out1); \
    _tmp1_m = __lasx_xvilvl_w(_out6, _out4); \
    _tmp3_m = __lasx_xvilvh_w(_out6, _out4); \
    _tmp5_m = __lasx_xvilvl_w(_out7, _out5); \
    _tmp7_m = __lasx_xvilvh_w(_out7, _out5); \
    /* Stage 4: doubleword interleave produces the final rows. */ \
    _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m); \
    _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m); \
    _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m); \
    _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m); \
    _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m); \
    _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m); \
    _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m); \
    _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m); \
}
1493
/*
 * =============================================================================
 * Description : Transpose input 16x8 halfword block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 halfword block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 *                         (output 8x16 halfword block)
 * Details     : The rows of the matrix become columns, and the columns become rows.
 *               The inputs are read in two passes (xvilvl_h, then xvilvh_h).
 *               The results of the first pass are held in local temporaries
 *               and all outputs are assigned only after the second pass has
 *               read the inputs, so an output may name the same variable as
 *               an input.
 *               The macro declares _tmp0_m.._tmp7_m, _t0.._t7 and
 *               _res0_m.._res3_m in its own block, so arguments must not
 *               use those names.
 * Example     : LASX_TRANSPOSE16x8_H
 *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *       _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *
 *       _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
 *       _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15, \
                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    __m256i _res0_m, _res1_m, _res2_m, _res3_m; \
    \
    /* Pass 1 (xvilvl_h of the inputs): results for _out0.._out3. */ \
    _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
    _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
    _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
    _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
    _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
    _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
    _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
    _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
    /* Keep these in locals: the inputs are read again below. */ \
    _res0_m = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
    _res1_m = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
    _res2_m = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
    _res3_m = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
    \
    /* Pass 2 (xvilvh_h of the inputs): results for _out4.._out7. */ \
    _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
    _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
    _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
    _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
    _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
    _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
    _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
    _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
    /* All input reads are done; it is now safe to write the outputs. */ \
    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
    _out0 = _res0_m; \
    _out1 = _res1_m; \
    _out2 = _res2_m; \
    _out3 = _res3_m; \
}
1597
1598 /*
1599 * =============================================================================
1600 * Description : Transpose 4x4 block with halfword elements in vectors
1601 * Arguments : Inputs - _in0, _in1, _in2, _in3
1602 * Outputs - _out0, _out1, _out2, _out3
1603 * Return Type - signed halfword
1604 * Details : The rows of the matrix become columns, and the columns become rows.
1605 * Example : See LASX_TRANSPOSE8x8_H
1606 * =============================================================================
1607 */
#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{ \
    /* Combine rows 0/1 and rows 2/3 with xvilvl_h; all inputs are */ \
    /* consumed here, before the first output is stored. */ \
    __m256i _pair01_m = __lasx_xvilvl_h(_in1, _in0); \
    __m256i _pair23_m = __lasx_xvilvl_h(_in3, _in2); \
    \
    /* Even outputs come from the word-level combination of both pairs. */ \
    _out0 = __lasx_xvilvl_w(_pair23_m, _pair01_m); \
    _out2 = __lasx_xvilvh_w(_pair23_m, _pair01_m); \
    /* Odd outputs are derived from the even output just stored. */ \
    _out1 = __lasx_xvilvh_d(_out0, _out0); \
    _out3 = __lasx_xvilvh_d(_out2, _out2); \
}
1619
1620 /*
1621 * =============================================================================
1622 * Description : Transpose input 8x8 byte block
1623 * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
1624 * (input 8x8 byte block)
1625 * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
1626 * (output 8x8 byte block)
1627 * Example : See LASX_TRANSPOSE8x8_H
1628 * =============================================================================
1629 */
#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0, \
                            _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    /* Stage 1: xvilvl_b on rows two apart (0/2, 1/3, 4/6, 5/7). */ \
    __m256i _r02_m = __lasx_xvilvl_b(_in2, _in0); \
    __m256i _r13_m = __lasx_xvilvl_b(_in3, _in1); \
    __m256i _r46_m = __lasx_xvilvl_b(_in6, _in4); \
    __m256i _r57_m = __lasx_xvilvl_b(_in7, _in5); \
    /* Stage 2: xvilvl_b / xvilvh_b on the stage-1 results. */ \
    __m256i _lo0123_m = __lasx_xvilvl_b(_r13_m, _r02_m); \
    __m256i _hi0123_m = __lasx_xvilvh_b(_r13_m, _r02_m); \
    __m256i _lo4567_m = __lasx_xvilvl_b(_r57_m, _r46_m); \
    __m256i _hi4567_m = __lasx_xvilvh_b(_r57_m, _r46_m); \
    \
    /* Stage 3: word-level combination gives the even outputs. */ \
    _out0 = __lasx_xvilvl_w(_lo4567_m, _lo0123_m); \
    _out2 = __lasx_xvilvh_w(_lo4567_m, _lo0123_m); \
    _out4 = __lasx_xvilvl_w(_hi4567_m, _hi0123_m); \
    _out6 = __lasx_xvilvh_w(_hi4567_m, _hi0123_m); \
    /* Each odd output is the preceding even output passed to xvbsrl_v */ \
    /* with an immediate of 8. */ \
    _out1 = __lasx_xvbsrl_v(_out0, 8); \
    _out3 = __lasx_xvbsrl_v(_out2, 8); \
    _out5 = __lasx_xvbsrl_v(_out4, 8); \
    _out7 = __lasx_xvbsrl_v(_out6, 8); \
}
1652
1653 /*
1654 * =============================================================================
1655 * Description : Transpose 8x8 block with halfword elements in vectors.
1656 * Arguments : Inputs - _in0, _in1, ~
1657 * Outputs - _out0, _out1, ~
1658 * Details : The rows of the matrix become columns, and the columns become rows.
1659 * Example : LASX_TRANSPOSE8x8_H
1660 * _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
1661 * _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
1662 * _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
1663 * _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
1664 * _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
1665 * _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
1666 * _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
1667 * _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
1668 *
1669 * _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
1670 * _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1671 * _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
1672 * _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
1673 * _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
1674 * _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
1675 * _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
1676 * _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
1677 * =============================================================================
1678 */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0, \
                            _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    /* Rows 4..7, xvilvl_h stage. */ \
    __m256i _lo46_m = __lasx_xvilvl_h(_in6, _in4); \
    __m256i _lo57_m = __lasx_xvilvl_h(_in7, _in5); \
    __m256i _bot0_m = __lasx_xvilvl_h(_lo57_m, _lo46_m); \
    __m256i _bot1_m = __lasx_xvilvh_h(_lo57_m, _lo46_m); \
    /* Rows 4..7, xvilvh_h stage. */ \
    __m256i _hi46_m = __lasx_xvilvh_h(_in6, _in4); \
    __m256i _hi57_m = __lasx_xvilvh_h(_in7, _in5); \
    __m256i _bot2_m = __lasx_xvilvl_h(_hi57_m, _hi46_m); \
    __m256i _bot3_m = __lasx_xvilvh_h(_hi57_m, _hi46_m); \
    /* Rows 0..3, xvilvl_h stage. */ \
    __m256i _lo02_m = __lasx_xvilvl_h(_in2, _in0); \
    __m256i _lo13_m = __lasx_xvilvl_h(_in3, _in1); \
    __m256i _top0_m = __lasx_xvilvl_h(_lo13_m, _lo02_m); \
    __m256i _top1_m = __lasx_xvilvh_h(_lo13_m, _lo02_m); \
    /* Rows 0..3, xvilvh_h stage. */ \
    __m256i _hi02_m = __lasx_xvilvh_h(_in2, _in0); \
    __m256i _hi13_m = __lasx_xvilvh_h(_in3, _in1); \
    __m256i _top2_m = __lasx_xvilvl_h(_hi13_m, _hi02_m); \
    __m256i _top3_m = __lasx_xvilvh_h(_hi13_m, _hi02_m); \
    \
    /* All inputs are consumed; even outputs use xvpickev_d ... */ \
    _out0 = __lasx_xvpickev_d(_bot0_m, _top0_m); \
    _out2 = __lasx_xvpickev_d(_bot1_m, _top1_m); \
    _out4 = __lasx_xvpickev_d(_bot2_m, _top2_m); \
    _out6 = __lasx_xvpickev_d(_bot3_m, _top3_m); \
    /* ... and odd outputs use xvpickod_d on the same operand pairs. */ \
    _out1 = __lasx_xvpickod_d(_bot0_m, _top0_m); \
    _out3 = __lasx_xvpickod_d(_bot1_m, _top1_m); \
    _out5 = __lasx_xvpickod_d(_bot2_m, _top2_m); \
    _out7 = __lasx_xvpickod_d(_bot3_m, _top3_m); \
}
1713
1714 /*
1715 * =============================================================================
1716 * Description : Butterfly of 4 input vectors
1717 * Arguments : Inputs - _in0, _in1, _in2, _in3
1718 * Outputs - _out0, _out1, _out2, _out3
1719 * Details : Butterfly operation
1720 * Example : LASX_BUTTERFLY_4
1721 * _out0 = _in0 + _in3;
1722 * _out1 = _in1 + _in2;
1723 * _out2 = _in1 - _in2;
1724 * _out3 = _in0 - _in3;
1725 * =============================================================================
1726 */
/*
 * 4-input butterfly built on __lasx_xvadd_b / __lasx_xvsub_b.
 * All four results are computed into locals before any output is stored, so
 * an output argument may be the same variable as an input argument.
 */
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{ \
    __m256i _bf0_m = __lasx_xvadd_b(_in0, _in3); \
    __m256i _bf1_m = __lasx_xvadd_b(_in1, _in2); \
    __m256i _bf2_m = __lasx_xvsub_b(_in1, _in2); \
    __m256i _bf3_m = __lasx_xvsub_b(_in0, _in3); \
    \
    _out0 = _bf0_m; \
    _out1 = _bf1_m; \
    _out2 = _bf2_m; \
    _out3 = _bf3_m; \
}
/*
 * 4-input butterfly built on __lasx_xvadd_h / __lasx_xvsub_h.
 * All four results are computed into locals before any output is stored, so
 * an output argument may be the same variable as an input argument.
 */
#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{ \
    __m256i _bf0_m = __lasx_xvadd_h(_in0, _in3); \
    __m256i _bf1_m = __lasx_xvadd_h(_in1, _in2); \
    __m256i _bf2_m = __lasx_xvsub_h(_in1, _in2); \
    __m256i _bf3_m = __lasx_xvsub_h(_in0, _in3); \
    \
    _out0 = _bf0_m; \
    _out1 = _bf1_m; \
    _out2 = _bf2_m; \
    _out3 = _bf3_m; \
}
/*
 * 4-input butterfly built on __lasx_xvadd_w / __lasx_xvsub_w.
 * All four results are computed into locals before any output is stored, so
 * an output argument may be the same variable as an input argument.
 */
#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{ \
    __m256i _bf0_m = __lasx_xvadd_w(_in0, _in3); \
    __m256i _bf1_m = __lasx_xvadd_w(_in1, _in2); \
    __m256i _bf2_m = __lasx_xvsub_w(_in1, _in2); \
    __m256i _bf3_m = __lasx_xvsub_w(_in0, _in3); \
    \
    _out0 = _bf0_m; \
    _out1 = _bf1_m; \
    _out2 = _bf2_m; \
    _out3 = _bf3_m; \
}
/*
 * 4-input butterfly built on __lasx_xvadd_d / __lasx_xvsub_d.
 * All four results are computed into locals before any output is stored, so
 * an output argument may be the same variable as an input argument.
 */
#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{ \
    __m256i _bf0_m = __lasx_xvadd_d(_in0, _in3); \
    __m256i _bf1_m = __lasx_xvadd_d(_in1, _in2); \
    __m256i _bf2_m = __lasx_xvsub_d(_in1, _in2); \
    __m256i _bf3_m = __lasx_xvsub_d(_in0, _in3); \
    \
    _out0 = _bf0_m; \
    _out1 = _bf1_m; \
    _out2 = _bf2_m; \
    _out3 = _bf3_m; \
}
1755
1756 /*
1757 * =============================================================================
1758 * Description : Butterfly of 8 input vectors
1759 * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
1760 * Outputs - _out0, _out1, _out2, _out3, ~
1761 * Details : Butterfly operation
1762 * Example : LASX_BUTTERFLY_8
1763 * _out0 = _in0 + _in7;
1764 * _out1 = _in1 + _in6;
1765 * _out2 = _in2 + _in5;
1766 * _out3 = _in3 + _in4;
1767 * _out4 = _in3 - _in4;
1768 * _out5 = _in2 - _in5;
1769 * _out6 = _in1 - _in6;
1770 * _out7 = _in0 - _in7;
1771 * =============================================================================
1772 */
/*
 * 8-input butterfly built on __lasx_xvadd_b / __lasx_xvsub_b.
 * All eight results are computed into locals before any output is stored, so
 * an output argument may be the same variable as an input argument.
 */
#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    __m256i _bf0_m = __lasx_xvadd_b(_in0, _in7); \
    __m256i _bf1_m = __lasx_xvadd_b(_in1, _in6); \
    __m256i _bf2_m = __lasx_xvadd_b(_in2, _in5); \
    __m256i _bf3_m = __lasx_xvadd_b(_in3, _in4); \
    __m256i _bf4_m = __lasx_xvsub_b(_in3, _in4); \
    __m256i _bf5_m = __lasx_xvsub_b(_in2, _in5); \
    __m256i _bf6_m = __lasx_xvsub_b(_in1, _in6); \
    __m256i _bf7_m = __lasx_xvsub_b(_in0, _in7); \
    \
    _out0 = _bf0_m; \
    _out1 = _bf1_m; \
    _out2 = _bf2_m; \
    _out3 = _bf3_m; \
    _out4 = _bf4_m; \
    _out5 = _bf5_m; \
    _out6 = _bf6_m; \
    _out7 = _bf7_m; \
}
1785
/*
 * 8-input butterfly built on __lasx_xvadd_h / __lasx_xvsub_h.
 * All eight results are computed into locals before any output is stored, so
 * an output argument may be the same variable as an input argument.
 */
#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    __m256i _bf0_m = __lasx_xvadd_h(_in0, _in7); \
    __m256i _bf1_m = __lasx_xvadd_h(_in1, _in6); \
    __m256i _bf2_m = __lasx_xvadd_h(_in2, _in5); \
    __m256i _bf3_m = __lasx_xvadd_h(_in3, _in4); \
    __m256i _bf4_m = __lasx_xvsub_h(_in3, _in4); \
    __m256i _bf5_m = __lasx_xvsub_h(_in2, _in5); \
    __m256i _bf6_m = __lasx_xvsub_h(_in1, _in6); \
    __m256i _bf7_m = __lasx_xvsub_h(_in0, _in7); \
    \
    _out0 = _bf0_m; \
    _out1 = _bf1_m; \
    _out2 = _bf2_m; \
    _out3 = _bf3_m; \
    _out4 = _bf4_m; \
    _out5 = _bf5_m; \
    _out6 = _bf6_m; \
    _out7 = _bf7_m; \
}
1798
/*
 * 8-input butterfly built on __lasx_xvadd_w / __lasx_xvsub_w.
 * All eight results are computed into locals before any output is stored, so
 * an output argument may be the same variable as an input argument.
 */
#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    __m256i _bf0_m = __lasx_xvadd_w(_in0, _in7); \
    __m256i _bf1_m = __lasx_xvadd_w(_in1, _in6); \
    __m256i _bf2_m = __lasx_xvadd_w(_in2, _in5); \
    __m256i _bf3_m = __lasx_xvadd_w(_in3, _in4); \
    __m256i _bf4_m = __lasx_xvsub_w(_in3, _in4); \
    __m256i _bf5_m = __lasx_xvsub_w(_in2, _in5); \
    __m256i _bf6_m = __lasx_xvsub_w(_in1, _in6); \
    __m256i _bf7_m = __lasx_xvsub_w(_in0, _in7); \
    \
    _out0 = _bf0_m; \
    _out1 = _bf1_m; \
    _out2 = _bf2_m; \
    _out3 = _bf3_m; \
    _out4 = _bf4_m; \
    _out5 = _bf5_m; \
    _out6 = _bf6_m; \
    _out7 = _bf7_m; \
}
1811
/*
 * 8-input butterfly built on __lasx_xvadd_d / __lasx_xvsub_d.
 * All eight results are computed into locals before any output is stored, so
 * an output argument may be the same variable as an input argument.
 */
#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    __m256i _bf0_m = __lasx_xvadd_d(_in0, _in7); \
    __m256i _bf1_m = __lasx_xvadd_d(_in1, _in6); \
    __m256i _bf2_m = __lasx_xvadd_d(_in2, _in5); \
    __m256i _bf3_m = __lasx_xvadd_d(_in3, _in4); \
    __m256i _bf4_m = __lasx_xvsub_d(_in3, _in4); \
    __m256i _bf5_m = __lasx_xvsub_d(_in2, _in5); \
    __m256i _bf6_m = __lasx_xvsub_d(_in1, _in6); \
    __m256i _bf7_m = __lasx_xvsub_d(_in0, _in7); \
    \
    _out0 = _bf0_m; \
    _out1 = _bf1_m; \
    _out2 = _bf2_m; \
    _out3 = _bf3_m; \
    _out4 = _bf4_m; \
    _out5 = _bf5_m; \
    _out6 = _bf6_m; \
    _out7 = _bf7_m; \
}
1824
1825 #endif //LASX
1826
1827 /*
1828 * =============================================================================
1829 * Description : Print out elements in vector.
 * Arguments   : Inputs  - RTYPE, element_num, in0, enter
 *               Outputs - none (the text is written with printf)
 * Details     : Print the first 'element_num' elements of 'in0' after casting
 *               it to 'RTYPE'; if 'enter' is TRUE, the prefix "\nVP:" is
 *               printed first.
1834 * Example : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
1835 * VP:1,2,3,4,
1836 * =============================================================================
1837 */
/*
 * Debug helper: casts in0 to RTYPE and prints its first element_num elements
 * with "%d," each; when enter is non-zero, "\nVP:" is printed first.
 * in0 and element_num are parenthesised so expression arguments (for example
 * `p + 1` or `wide ? 8 : 4`) are cast and compared as a whole.
 * NOTE(review): elements are passed to printf with "%d", so the element type
 * presumably must promote to int - confirm for 64-bit or float vectors.
 * The caller must have <stdio.h> in scope.
 */
#define VECT_PRINT(RTYPE, element_num, in0, enter) \
{ \
    RTYPE _tmp0 = (RTYPE)(in0); \
    int _i = 0; \
    if (enter) { \
        printf("\nVP:"); \
    } \
    for (_i = 0; _i < (element_num); _i++) { \
        printf("%d,", _tmp0[_i]); \
    } \
}
1847
1848 #endif /* LOONGSON_INTRINSICS_H */
1849
1850